diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5913f976ddbd1..c2c35d2ff3b8e 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -16,6 +16,7 @@ /amd/device-libs/ @b-sumner @lamb-j /amd/hipcc/ @david-salinas @lamb-j +/libc/ @llvm/reviewers-libc /libcxx/ @llvm/reviewers-libcxx /libcxxabi/ @llvm/reviewers-libcxxabi /libunwind/ @llvm/reviewers-libunwind diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h index f7c9e31915d74..d3ff37aa1a801 100644 --- a/bolt/include/bolt/Profile/DataAggregator.h +++ b/bolt/include/bolt/Profile/DataAggregator.h @@ -234,21 +234,9 @@ class DataAggregator : public DataReader { /// Return a vector of offsets corresponding to a trace in a function /// if the trace is valid, std::nullopt otherwise. std::optional, 16>> - getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, uint64_t Count, + getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, bool IsReturn) const; - /// Record external entry into the function \p BF. - /// - /// Return true if the entry is valid, false otherwise. - bool recordEntry(BinaryFunction &BF, uint64_t To, bool Mispred, - uint64_t Count = 1) const; - - /// Record exit from the function \p BF via a call or return. - /// - /// Return true if the exit point is valid, false otherwise. 
- bool recordExit(BinaryFunction &BF, uint64_t From, bool Mispred, - uint64_t Count = 1) const; - /// Branch stacks aggregation statistics uint64_t NumTraces{0}; uint64_t NumInvalidTraces{0}; diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp index ef7ba54ff6ddc..072274f2578bb 100644 --- a/bolt/lib/Core/DIEBuilder.cpp +++ b/bolt/lib/Core/DIEBuilder.cpp @@ -705,7 +705,8 @@ bool DIEBuilder::cloneExpression(const DataExtractor &Data, Description.Op[0] == Encoding::BaseTypeRef) || (Description.Op.size() == 2 && Description.Op[1] == Encoding::BaseTypeRef && - Description.Op[0] != Encoding::Size1)) + Description.Op[0] != Encoding::Size1 && + Description.Op[0] != Encoding::SizeLEB)) BC.outs() << "BOLT-WARNING: [internal-dwarf-error]: unsupported DW_OP " "encoding.\n"; @@ -713,9 +714,8 @@ bool DIEBuilder::cloneExpression(const DataExtractor &Data, Description.Op[0] == Encoding::BaseTypeRef) || (Description.Op.size() == 2 && Description.Op[1] == Encoding::BaseTypeRef && - Description.Op[0] == Encoding::Size1)) { - // This code assumes that the other non-typeref operand fits into 1 - // byte. 
+ (Description.Op[0] == Encoding::Size1 || + Description.Op[0] == Encoding::SizeLEB))) { assert(OpOffset < Op.getEndOffset()); const uint32_t ULEBsize = Op.getEndOffset() - OpOffset - 1; (void)ULEBsize; @@ -727,7 +727,9 @@ bool DIEBuilder::cloneExpression(const DataExtractor &Data, if (Description.Op.size() == 1) { RefOffset = Op.getRawOperand(0); } else { - OutputBuffer.push_back(Op.getRawOperand(0)); + const StringRef FirstOpBytes = + Data.getData().slice(OpOffset + 1, Op.getOperandEndOffset(0)); + OutputBuffer.append(FirstOpBytes.begin(), FirstOpBytes.end()); RefOffset = Op.getRawOperand(1); } uint32_t Offset = 0; @@ -903,6 +905,7 @@ void DIEBuilder::cloneAttribute( case dwarf::DW_FORM_ref2: case dwarf::DW_FORM_ref4: case dwarf::DW_FORM_ref8: + case dwarf::DW_FORM_ref_udata: cloneDieOffsetReferenceAttribute(Die, U, InputDIE, AttrSpec, Val.getUnit()->getOffset() + *Val.getAsRelativeReference()); diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 344682f9ae2f4..b4e4417306a38 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -717,10 +717,8 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) { Error DataAggregator::readProfile(BinaryContext &BC) { processProfile(BC); - for (auto &BFI : BC.getBinaryFunctions()) { - BinaryFunction &Function = BFI.second; - convertBranchData(Function); - } + if (Error E = DataReader::readProfile(BC)) + return E; if (opts::AggregateOnly) { if (opts::ProfileFormat == opts::ProfileFormatKind::PF_Fdata) @@ -747,6 +745,12 @@ bool DataAggregator::mayHaveProfileData(const BinaryFunction &Function) { } void DataAggregator::processProfile(BinaryContext &BC) { + // Set for DataReader::readProfile + NoLBRMode = opts::BasicAggregation; + + // Set for DataReader::recordBranch and evaluateProfileData + BATMode = usesBAT(); + if (opts::BasicAggregation) processBasicEvents(); else @@ -772,6 +776,9 @@ void DataAggregator::processProfile(BinaryContext &BC) { 
llvm::stable_sort(FuncBranches.second.EntryData); } + for (auto &FuncBasicSamples : NamesToBasicSamples) + llvm::stable_sort(FuncBasicSamples.second.Data); + for (auto &MemEvents : NamesToMemEvents) llvm::stable_sort(MemEvents.second.Data); @@ -880,8 +887,6 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, FromAggrData->Name = SrcFunc; setBranchData(*FromFunc, FromAggrData); } - - recordExit(*FromFunc, From, Mispreds, Count); } if (ToFunc) { DstFunc = getLocationName(*ToFunc, BAT); @@ -891,8 +896,6 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, ToAggrData->Name = DstFunc; setBranchData(*ToFunc, ToAggrData); } - - recordEntry(*ToFunc, To, Mispreds, Count); } if (FromAggrData) @@ -941,10 +944,8 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, return false; // Treat recursive control transfers as inter-branches. - if (FromFunc == ToFunc && To != 0) { - recordBranch(*FromFunc, From, To, Count, Mispreds); + if (FromFunc == ToFunc && To != 0) return doIntraBranch(*FromFunc, From, To, Count, Mispreds); - } return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds); } @@ -976,7 +977,7 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count, std::optional FTs = BAT && BAT->isBATFunction(FuncAddress) ? 
BAT->getFallthroughsInTrace(FuncAddress, From - IsReturn, To) - : getFallthroughsInTrace(*FromFunc, Trace, Count, IsReturn); + : getFallthroughsInTrace(*FromFunc, Trace, IsReturn); if (!FTs) { LLVM_DEBUG(dbgs() << "Invalid trace " << Trace << '\n'); NumInvalidTraces += Count; @@ -993,7 +994,7 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count, std::optional, 16>> DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, - uint64_t Count, bool IsReturn) const { + bool IsReturn) const { SmallVector, 16> Branches; BinaryContext &BC = BF.getBinaryContext(); @@ -1073,53 +1074,9 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, BB = NextBB; } - // Record fall-through jumps - for (const auto &[FromOffset, ToOffset] : Branches) { - BinaryBasicBlock *FromBB = BF.getBasicBlockContainingOffset(FromOffset); - BinaryBasicBlock *ToBB = BF.getBasicBlockAtOffset(ToOffset); - assert(FromBB && ToBB); - BinaryBasicBlock::BinaryBranchInfo &BI = FromBB->getBranchInfo(*ToBB); - BI.Count += Count; - } - return Branches; } -bool DataAggregator::recordEntry(BinaryFunction &BF, uint64_t To, bool Mispred, - uint64_t Count) const { - if (To > BF.getSize()) - return false; - - if (!BF.hasProfile()) - BF.ExecutionCount = 0; - - BinaryBasicBlock *EntryBB = nullptr; - if (To == 0) { - BF.ExecutionCount += Count; - if (!BF.empty()) - EntryBB = &BF.front(); - } else if (BinaryBasicBlock *BB = BF.getBasicBlockAtOffset(To)) { - if (BB->isEntryPoint()) - EntryBB = BB; - } - - if (EntryBB) - EntryBB->setExecutionCount(EntryBB->getKnownExecutionCount() + Count); - - return true; -} - -bool DataAggregator::recordExit(BinaryFunction &BF, uint64_t From, bool Mispred, - uint64_t Count) const { - if (!BF.isSimple() || From > BF.getSize()) - return false; - - if (!BF.hasProfile()) - BF.ExecutionCount = 0; - - return true; -} - ErrorOr DataAggregator::parseLBREntry() { LBREntry Res; ErrorOr FromStrRes = parseString('/'); diff --git 
a/bolt/test/X86/dwarf5-form-ref-udata.s b/bolt/test/X86/dwarf5-form-ref-udata.s new file mode 100644 index 0000000000000..0b63a1711423d --- /dev/null +++ b/bolt/test/X86/dwarf5-form-ref-udata.s @@ -0,0 +1,70 @@ +# REQUIRES: system-linux + +# RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %s -o %t.o +# RUN: %clang %cflags -dwarf-5 %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections 2>&1 | \ +# RUN: FileCheck %s --check-prefix CHECK-BOLT +# RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck %s + +## Verify BOLT preserves DW_FORM_ref_udata (CU-relative ULEB128 DIE reference), +## a form GCC may emit instead of DW_FORM_ref4. + +# CHECK: DW_TAG_subprogram +# CHECK: DW_AT_type [DW_FORM_ref_udata] +# CHECK-SAME: "int" + +# CHECK-BOLT-NOT: BOLT-WARNING + + .text + .file 0 "." "main.cpp" + .globl main +main: +.Lfunc_begin0: + .loc 0 1 0 + xorl %eax, %eax + retq +.Lfunc_end0: + .size main, .Lfunc_end0-main + +## Force relocations against .text +.reloc 0, R_X86_64_NONE + + .section .debug_abbrev,"",@progbits + .byte 1, 17, 1 # CU, has children + .byte 17, 1 # DW_AT_low_pc, DW_FORM_addr + .byte 18, 6 # DW_AT_high_pc, DW_FORM_data4 + .byte 16, 23 # DW_AT_stmt_list, DW_FORM_sec_offset + .byte 0, 0 + .byte 2, 46, 0 # subprogram, no children + .byte 17, 1 # DW_AT_low_pc, DW_FORM_addr + .byte 18, 6 # DW_AT_high_pc, DW_FORM_data4 + .byte 73, 21 # DW_AT_type, DW_FORM_ref_udata + .byte 0, 0 + .byte 3, 36, 0 # base_type, no children + .byte 3, 8 # DW_AT_name, DW_FORM_string + .byte 0, 0 + .byte 0 + + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 +.Ldebug_info_start0: + .short 5 # DWARF version + .byte 1 # DW_UT_compile + .byte 8 # Address size + .long .debug_abbrev # Abbrev offset + .byte 1 # CU + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Lline_table_start0 # DW_AT_stmt_list + .byte 2 # subprogram + .quad 
.Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .uleb128 .Ltype_int-.Lcu_begin0 # DW_AT_type (DW_FORM_ref_udata) +.Ltype_int: + .byte 3 # base_type + .asciz "int" # DW_AT_name + .byte 0 # End children of CU +.Ldebug_info_end0: + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/dwarf5-locexpr-regval-type.s b/bolt/test/X86/dwarf5-locexpr-regval-type.s new file mode 100644 index 0000000000000..f60604c33b245 --- /dev/null +++ b/bolt/test/X86/dwarf5-locexpr-regval-type.s @@ -0,0 +1,83 @@ +# REQUIRES: system-linux + +# RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %s -o %t.o +# RUN: %clang %cflags -dwarf-5 %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections 2>&1 | \ +# RUN: FileCheck %s --check-prefix CHECK-BOLT +# RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck %s + +## Verify BOLT correctly handles DW_OP_regval_type. Its operands are +## (ULEB128 register, ULEB128 base type DIE offset). The base type +## reference must be updated when DIEs are relocated. Use a register +## number that requires multi-byte ULEB128 encoding to exercise the +## first-operand byte-copy path. + +# CHECK: DW_TAG_variable +# CHECK: DW_AT_location [DW_FORM_exprloc] +# CHECK-SAME: DW_OP_regval_type 0xc8 (0x[[#%.8x,TYPE:]] -> +# CHECK: 0x[[#TYPE]]: DW_TAG_base_type + +# CHECK-BOLT-NOT: BOLT-WARNING + + .text + .file 0 "." 
"main.cpp" + .globl main +main: +.Lfunc_begin0: + .loc 0 1 0 + xorl %eax, %eax + retq +.Lfunc_end0: + .size main, .Lfunc_end0-main + +## Force relocations against .text +.reloc 0, R_X86_64_NONE + + .section .debug_abbrev,"",@progbits + .byte 1, 17, 1 # CU, has children + .byte 17, 1 # DW_AT_low_pc, DW_FORM_addr + .byte 18, 6 # DW_AT_high_pc, DW_FORM_data4 + .byte 16, 23 # DW_AT_stmt_list, DW_FORM_sec_offset + .byte 0, 0 + .byte 2, 46, 1 # subprogram, has children + .byte 17, 1 # DW_AT_low_pc, DW_FORM_addr + .byte 18, 6 # DW_AT_high_pc, DW_FORM_data4 + .byte 0, 0 + .byte 3, 52, 0 # variable, no children + .byte 2, 24 # DW_AT_location, DW_FORM_exprloc + .byte 0, 0 + .byte 4, 36, 0 # base_type, no children + .byte 3, 8 # DW_AT_name, DW_FORM_string + .byte 0, 0 + .byte 0 + + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 +.Ldebug_info_start0: + .short 5 # DWARF version + .byte 1 # DW_UT_compile + .byte 8 # Address size + .long .debug_abbrev # Abbrev offset + .byte 1 # CU + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Lline_table_start0 # DW_AT_stmt_list + .byte 2 # subprogram + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 3 # variable + .byte .Lloc_end-.Lloc_start # exprloc length +.Lloc_start: + .byte 0xa5 # DW_OP_regval_type + .uleb128 200 # register 200 (multi-byte ULEB128) + .uleb128 .Ltype_int-.Lcu_begin0 # base type DIE offset +.Lloc_end: + .byte 0 # End children of subprogram +.Ltype_int: + .byte 4 # base_type + .asciz "int" # DW_AT_name + .byte 0 # End children of CU +.Ldebug_info_end0: + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/clang-tools-extra/clang-tidy/doc8.ini b/clang-tools-extra/clang-tidy/doc8.ini index 14cac344989b3..514e75ad01df5 100644 --- a/clang-tools-extra/clang-tidy/doc8.ini +++ b/clang-tools-extra/clang-tidy/doc8.ini @@ -1,2 +1,3 @@ [doc8] ignore-path = 
clang-tools-extra/docs/clang-tidy/Integrations.rst +ignore = D001 diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp index 71189ab9de8b2..9f7da6b6f8fac 100644 --- a/clang-tools-extra/clangd/CodeComplete.cpp +++ b/clang-tools-extra/clangd/CodeComplete.cpp @@ -1536,46 +1536,6 @@ FuzzyFindRequest speculativeFuzzyFindRequestForCompletion( return CachedReq; } -// This function is similar to Lexer::findNextToken(), but assumes -// that the input SourceLocation is the completion point (which is -// a case findNextToken() does not handle). -std::optional -findTokenAfterCompletionPoint(SourceLocation CompletionPoint, - const SourceManager &SM, - const LangOptions &LangOpts) { - SourceLocation Loc = CompletionPoint; - if (Loc.isMacroID()) { - if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) - return std::nullopt; - } - - // Advance to the next SourceLocation after the completion point. - // Lexer::findNextToken() would call MeasureTokenLength() here, - // which does not handle the completion point (and can't, because - // the Lexer instance it constructs internally doesn't have a - // Preprocessor and so doesn't know about the completion point). - Loc = Loc.getLocWithOffset(1); - - // Break down the source location. - std::pair LocInfo = SM.getDecomposedLoc(Loc); - - // Try to load the file buffer. - bool InvalidTemp = false; - StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); - if (InvalidTemp) - return std::nullopt; - - const char *TokenBegin = File.data() + LocInfo.second; - - // Lex from the start of the given location. - Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), - TokenBegin, File.end()); - // Find the token. - Token Tok; - TheLexer.LexFromRawLexer(Tok); - return Tok; -} - // Runs Sema-based (AST) and Index-based completion, returns merged results. 
// // There are a few tricky considerations: @@ -1619,6 +1579,9 @@ class CodeCompleteFlow { // location is an opening parenthesis (tok::l_paren) because this would add // extra parenthesis. tok::TokenKind NextTokenKind = tok::eof; + // End of the identifier suffix after the completion cursor. + // Shared by NextTokenKind detection and replace-range calculation. + SourceLocation IdentifierSuffixEnd; // Counters for logging. int NSema = 0, NIndex = 0, NSemaAndIndex = 0, NIdent = 0; bool Incomplete = false; // Would more be available with a higher limit? @@ -1675,11 +1638,20 @@ class CodeCompleteFlow { auto Style = getFormatStyleForFile(SemaCCInput.FileName, SemaCCInput.ParseInput.Contents, *SemaCCInput.ParseInput.TFS, false); - const auto NextToken = findTokenAfterCompletionPoint( - Recorder->CCSema->getPreprocessor().getCodeCompletionLoc(), - Recorder->CCSema->getSourceManager(), Recorder->CCSema->LangOpts); - if (NextToken) - NextTokenKind = NextToken->getKind(); + const auto &SM = Recorder->CCSema->getSourceManager(); + const LangOptions &LangOpts = Recorder->CCSema->getLangOpts(); + // Skip past the NUL byte inserted at the cursor, then scan through any + // identifier continuation characters to find where the suffix ends. + IdentifierSuffixEnd = Lexer::findEndOfIdentifierContinuation( + Recorder->CCSema->getPreprocessor() + .getCodeCompletionLoc() + .getLocWithOffset(1), + SM, LangOpts); + // Lex the token after the identifier suffix to determine NextTokenKind. + if (Token NextToken; + !Lexer::getRawToken(IdentifierSuffixEnd, NextToken, SM, LangOpts, + /*IgnoreWhiteSpace=*/true)) + NextTokenKind = NextToken.getKind(); // If preprocessor was run, inclusions from preprocessor callback should // already be added to Includes. Inserter.emplace( @@ -1696,7 +1668,6 @@ class CodeCompleteFlow { // that happens here (though the per-URI-scheme initialization is lazy). // The per-result proximity scoring is (amortized) very cheap. 
FileDistanceOptions ProxOpts{}; // Use defaults. - const auto &SM = Recorder->CCSema->getSourceManager(); llvm::StringMap ProxSources; auto MainFileID = Includes.getID(SM.getFileEntryForID(SM.getMainFileID())); @@ -1905,17 +1876,7 @@ class CodeCompleteFlow { // Returns the LSP position at the end of the identifier suffix after the // code completion cursor. Position getEndOfCodeCompletionReplace(const SourceManager &SM) { - const Preprocessor &PP = Recorder->CCSema->getPreprocessor(); - const LangOptions &LangOpts = Recorder->CCSema->getLangOpts(); - - // Skip past the code completion NUL byte and scan forward through - // identifier continuation characters (letters, digits, _, $, UCN, - // unicode). This handles all cases uniformly: with prefix ("vac^1abc"), - // without prefix ("vec.^asdf"), and digit-starting ("vec.^1abc"). - const SourceLocation SuffixBegin = - PP.getCodeCompletionLoc().getLocWithOffset(1); - Position End = sourceLocToPosition( - SM, Lexer::findEndOfIdentifierContinuation(SuffixBegin, SM, LangOpts)); + Position End = sourceLocToPosition(SM, IdentifierSuffixEnd); // Adjust for the NUL byte inserted at the cursor by code completion, // which inflates the column by 1. End.character--; diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp index 726fee9c2f0fe..f3a432a3b2632 100644 --- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp +++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp @@ -4377,15 +4377,16 @@ TEST(CompletionTest, FunctionArgsExist) { EXPECT_THAT( completions(Context + "int y = fo^(42)", {}, Opts).Completions, UnorderedElementsAre(AllOf(labeled("foo(int A)"), snippetSuffix("")))); - // FIXME(kirillbobyrev): No snippet should be produced here. 
- EXPECT_THAT(completions(Context + "int y = fo^o(42)", {}, Opts).Completions, - UnorderedElementsAre( - AllOf(labeled("foo(int A)"), snippetSuffix("(${1:int A})")))); + EXPECT_THAT( + completions(Context + "int y = fo^o(42)", {}, Opts).Completions, + UnorderedElementsAre(AllOf(labeled("foo(int A)"), snippetSuffix("")))); EXPECT_THAT( completions(Context + "int y = ba^", {}, Opts).Completions, UnorderedElementsAre(AllOf(labeled("bar()"), snippetSuffix("()")))); EXPECT_THAT(completions(Context + "int y = ba^()", {}, Opts).Completions, UnorderedElementsAre(AllOf(labeled("bar()"), snippetSuffix("")))); + EXPECT_THAT(completions(Context + "int y = ba^r()", {}, Opts).Completions, + UnorderedElementsAre(AllOf(labeled("bar()"), snippetSuffix("")))); EXPECT_THAT( completions(Context + "Object o = Obj^", {}, Opts).Completions, Contains(AllOf(labeled("Object(int B)"), snippetSuffix("(${1:int B})"), @@ -4408,9 +4409,17 @@ TEST(CompletionTest, FunctionArgsExist) { Contains(AllOf(labeled("Container(int Size)"), snippetSuffix(""), kind(CompletionItemKind::Constructor)))); + EXPECT_THAT( + completions(Context + "Container c = Cont^ainer()", {}, Opts).Completions, + Contains(AllOf(labeled("Container(int Size)"), + snippetSuffix("<${1:typename T}>"), + kind(CompletionItemKind::Constructor)))); EXPECT_THAT(completions(Context + "MAC^(2)", {}, Opts).Completions, Contains(AllOf(labeled("MACRO(x)"), snippetSuffix(""), kind(CompletionItemKind::Function)))); + EXPECT_THAT(completions(Context + "MAC^RO(2)", {}, Opts).Completions, + Contains(AllOf(labeled("MACRO(x)"), snippetSuffix(""), + kind(CompletionItemKind::Function)))); } TEST(CompletionTest, FunctionArgsExist_Issue1785) { diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index f852f76f5038c..7b1b7a7384b07 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -3694,6 +3694,38 @@ the configuration (without a prefix: ``Auto``). +.. 
_BreakBeforeReturnType: + +**BreakBeforeReturnType** (``BreakBeforeReturnTypeStyle``) :versionbadge:`clang-format 23` :ref:`¶ ` + The function declaration/definition return type breaking style to use. + Trailing return types (``auto f() -> T``) are not affected. To have + identifier macros (e.g. ``__always_inline``) treated as specifiers, + add them to ``AttributeMacros``. + + Possible values: + + * ``BBRTS_None`` (in configuration: ``None``) + Do not force a break before the return type. + + * ``BBRTS_All`` (in configuration: ``All``) + Always break before the return type. + + .. code-block:: c++ + + static inline + void f(); + + * ``BBRTS_TopLevel`` (in configuration: ``TopLevel``) + Break before the return type of top-level functions only. + + * ``BBRTS_AllDefinitions`` (in configuration: ``AllDefinitions``) + Break before the return type of function definitions only. + + * ``BBRTS_TopLevelDefinitions`` (in configuration: ``TopLevelDefinitions``) + Break before the return type of top-level definitions only. + + + .. _BreakBeforeTemplateCloser: **BreakBeforeTemplateCloser** (``Boolean``) :versionbadge:`clang-format 21` :ref:`¶ ` diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5ed471f52d97f..b9937c9b691cc 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -775,6 +775,8 @@ clang-format declaration parameters. - Add ``EnumAssignments`` option to ``AlignConsecutiveAssignments`` for aligning enum assignments without affecting other assignments. +- Add ``BreakBeforeReturnType`` option to break before the function return + type. 
libclang -------- diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index eca3cc44c41b6..27b2d8f4a405b 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -2462,6 +2462,31 @@ struct FormatStyle { /// \version 16 BreakBeforeInlineASMColonStyle BreakBeforeInlineASMColon; + /// Different ways to break before the function return type. + enum BreakBeforeReturnTypeStyle : int8_t { + /// Do not force a break before the return type. + BBRTS_None, + /// Always break before the return type. + /// \code + /// static inline + /// void f(); + /// \endcode + BBRTS_All, + /// Break before the return type of top-level functions only. + BBRTS_TopLevel, + /// Break before the return type of function definitions only. + BBRTS_AllDefinitions, + /// Break before the return type of top-level definitions only. + BBRTS_TopLevelDefinitions, + }; + + /// The function declaration/definition return type breaking style to use. + /// Trailing return types (``auto f() -> T``) are not affected. To have + /// identifier macros (e.g. ``__always_inline``) treated as specifiers, + /// add them to ``AttributeMacros``. + /// \version 23 + BreakBeforeReturnTypeStyle BreakBeforeReturnType; + /// If ``true``, break before a template closing bracket (``>``) when there is /// a line break after the matching opening bracket (``<``). 
/// \code @@ -6092,6 +6117,7 @@ struct FormatStyle { BreakBeforeCloseBracketSwitch == R.BreakBeforeCloseBracketSwitch && BreakBeforeConceptDeclarations == R.BreakBeforeConceptDeclarations && BreakBeforeInlineASMColon == R.BreakBeforeInlineASMColon && + BreakBeforeReturnType == R.BreakBeforeReturnType && BreakBeforeTemplateCloser == R.BreakBeforeTemplateCloser && BreakBeforeTernaryOperators == R.BreakBeforeTernaryOperators && BreakBinaryOperations == R.BreakBinaryOperations && diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 5639034d5ae05..5202244cee2a7 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -5100,11 +5100,13 @@ class Sema final : public SemaBase { /// otherwise setting numParams to the appropriate value. bool CheckRegparmAttr(const ParsedAttr &attr, unsigned &value); - /// Create an CUDALaunchBoundsAttr attribute. + /// Create a CUDALaunchBoundsAttr attribute. By default, the function only + /// supports nvptx target architectures and skips MaxBlocks if it is previous + /// to sm_90. Use \p IgnoreArch to skip the architecture check. CUDALaunchBoundsAttr *CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, - Expr *MinBlocks, - Expr *MaxBlocks); + Expr *MinBlocks, Expr *MaxBlocks, + bool IgnoreArch = false); /// AddLaunchBoundsAttr - Adds a launch_bounds attribute to a particular /// declaration. diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 7c2729e07a0e4..d2f30a9d6562c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1999,10 +1999,29 @@ LValue CIRGenFunction::emitMaterializeTemporaryExpr( // Perform derived-to-base casts and/or field accesses, to get from the // temporary object we created (and, potentially, for which we extended // the lifetime) to the subobject we're binding the reference to. 
- if (!adjustments.empty()) { - cgm.errorNYI(e->getSourceRange(), - "emitMaterializeTemporaryExpr: Adjustments"); - return {}; + for (SubobjectAdjustment &adjustment : llvm::reverse(adjustments)) { + switch (adjustment.Kind) { + case SubobjectAdjustment::DerivedToBaseAdjustment: + object = + getAddressOfBaseClass(object, adjustment.DerivedToBase.DerivedClass, + adjustment.DerivedToBase.BasePath->path(), + /*nullCheckValue=*/false, e->getExprLoc()); + break; + case SubobjectAdjustment::FieldAdjustment: { + LValue lv = makeAddrLValue(object, e->getType(), AlignmentSource::Decl); + lv = emitLValueForField(lv, adjustment.Field); + assert(lv.isSimple() && + "materialized temporary field is not a simple lvalue"); + object = lv.getAddress(); + break; + } + case SubobjectAdjustment::MemberPointerAdjustment: { + mlir::Value ptr = emitScalarExpr(adjustment.Ptr.RHS); + object = emitCXXMemberDataPointerAddress( + e, object, ptr, adjustment.Ptr.MPT, /*baseInfo=*/nullptr); + break; + } + } } return makeAddrLValue(object, m->getType(), AlignmentSource::Decl); diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index d61273b24c3d3..b910ca3c8286c 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -129,6 +129,12 @@ struct LoweringPreparePass /// Get the declaration for the 'wrapper' function for a global-TLS variable. cir::FuncOp getOrCreateThreadLocalWrapper(CIRBaseBuilderTy &builder, cir::GlobalOp op); + // Function that generates the guard global variable, get-global, and 'if' + // condition for global TLS init function generation. This inserts an 'if' + // with the store at the beginning of the 'then' region, so inserts into the + // body should happen after that. 
+ cir::IfOp buildGlobalTlsGuardCheck(CIRBaseBuilderTy &builder, + mlir::Location loc, cir::GlobalOp guard); /// Handle the dtor region by registering destructor with __cxa_atexit cir::FuncOp getOrCreateDtorFunc(CIRBaseBuilderTy &builder, cir::GlobalOp op, mlir::Region &dtorRegion, @@ -181,6 +187,10 @@ struct LoweringPreparePass /// Get or create the __init_tls function. cir::FuncOp getTlsInitFn(); + // Create the __tls_guard variable. + cir::GlobalOp createGlobalThreadLocalGuard(CIRBaseBuilderTy &builder, + mlir::Location loc); + /// Create a guard global variable for a static local. cir::GlobalOp createGuardGlobalOp(CIRBaseBuilderTy &builder, mlir::Location loc, llvm::StringRef name, @@ -377,6 +387,9 @@ struct LoweringPreparePass entryBB.getOperations().splice(entryBB.end(), dtorBlock.getOperations(), dtorBlock.begin(), std::prev(dtorBlock.end())); + // make sure we leave the insert location after the operations we just + // inserted. + builder.setInsertionPointToEnd(&entryBB); } /// Emit the guarded initialization for a static local variable. @@ -1143,20 +1156,44 @@ LoweringPreparePass::buildCXXGlobalVarDeclInitFunc(cir::GlobalOp op) { // the function entry, and discard extra blocks (which contain only // unreachable terminators from EH cleanup paths). mlir::Block *entryBB = f.addEntryBlock(); + builder.setInsertionPointToStart(entryBB); + + // If this is a global TLS variable (that is, declared at namespace scope), we + // have to emit the guard variable here. 
+ bool needsTlsGuard = op.getDynTlsRefs() && op.getDynTlsRefs()->getGuardName(); + cir::IfOp guardIf; + if (needsTlsGuard) { + guardIf = buildGlobalTlsGuardCheck( + builder, op.getLoc(), + getOrCreateStaticLocalDeclGuardAddress( + builder, op, op.getDynTlsRefs()->getGuardName().getValue(), + /*isLocalVarDecl=*/false, + /*useInt8GuardVariable=*/op.hasInternalLinkage())); + builder.setInsertionPointToEnd(&guardIf.getThenRegion().front()); + } + if (!op.getCtorRegion().empty()) { mlir::Block &block = op.getCtorRegion().front(); - entryBB->getOperations().splice(entryBB->begin(), block.getOperations(), - block.begin(), std::prev(block.end())); + mlir::Block *insertBlock = builder.getBlock(); + insertBlock->getOperations().splice(insertBlock->end(), + block.getOperations(), block.begin(), + std::prev(block.end())); } // Register the destructor call with __cxa_atexit mlir::Region &dtorRegion = op.getDtorRegion(); if (!dtorRegion.empty()) { assert(!cir::MissingFeatures::astVarDeclInterface()); - assert(!cir::MissingFeatures::opGlobalThreadLocal()); emitGlobalGuardedDtorRegion(builder, op, dtorRegion, - op.getTlsModel().has_value(), *entryBB); + op.getTlsModel().has_value(), + *builder.getBlock()); + } + + // If we're actually in the 'if' above, create a yield. + if (needsTlsGuard) { + builder.setInsertionPointToEnd(&guardIf.getThenRegion().back()); + cir::YieldOp::create(builder, op.getLoc()); } // Replace cir.yield with cir.return @@ -1710,9 +1747,56 @@ void LoweringPreparePass::buildGlobalCtorDtorList() { } } +cir::GlobalOp +LoweringPreparePass::createGlobalThreadLocalGuard(CIRBaseBuilderTy &builder, + mlir::Location loc) { + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(mlirModule.getBody()); + + // The TLS Guard is always an Int8Ty. 
+ cir::IntType guardTy = builder.getSIntNTy(8); + auto g = cir::GlobalOp::create(builder, loc, "__tls_guard", guardTy); + g.setLinkageAttr(cir::GlobalLinkageKindAttr::get( + builder.getContext(), cir::GlobalLinkageKind::InternalLinkage)); + g.setAlignment(clang::CharUnits::One().getAsAlign().value()); + // At the moment, we only have implementation for this mode, as it is the + // default. At one point we might need to load this mode from the module. + g.setTlsModel(TLS_Model::GeneralDynamic); + g.setInitialValueAttr(cir::IntAttr::get(guardTy, 0)); + return g; +} + +cir::IfOp LoweringPreparePass::buildGlobalTlsGuardCheck( + CIRBaseBuilderTy &builder, mlir::Location loc, cir::GlobalOp guard) { + cir::GetGlobalOp getGuard = builder.createGetGlobal(guard, /*tls=*/true); + mlir::Value getGuardValue = getGuard; + + // Classic codegen always just loads the first byte of the guard instead of + // the whole thing. __tls_guard is already only 8 bits, but for the case of + // unordered TLS, it gets created as 64 bits. + if (guard.getSymType() != builder.getSIntNTy(8)) + getGuardValue = builder.createBitcast( + getGuard, cir::PointerType::get(builder.getSIntNTy(8))); + + mlir::Value guardLoad = + builder.createAlignedLoad(loc, getGuardValue, *guard.getAlignment()); + auto zero = builder.getConstantInt(loc, builder.getSIntNTy(8), 0); + cir::CmpOp compare = + builder.createCompare(loc, cir::CmpOpKind::eq, guardLoad, zero); + return cir::IfOp::create( + builder, loc, compare, + /*withElseRegion=*/false, [&](mlir::OpBuilder &, mlir::Location loc) { + // Classic codegen still does this store as a i8, but it doesn't seem + // reasonable to do an i8 store into a 64 bit value? 
+ builder.createStore( + loc, builder.getConstantInt(loc, guard.getSymType(), 1), getGuard); + }); +} + void LoweringPreparePass::buildCXXGlobalTlsFunc() { if (globalThreadLocalInitializers.empty()) return; + // The global-ordered-init function for TLS variables just calls each of the // init-functions in order after doing a guard. @@ -1721,9 +1805,20 @@ void LoweringPreparePass::buildCXXGlobalTlsFunc() { CIRBaseBuilderTy builder(getContext()); mlir::Block *entryBB = tlsInit.addEntryBlock(); builder.setInsertionPointToStart(entryBB); - // Note: a followup patch will emit the body here correctly. + + cir::IfOp ifOperation = buildGlobalTlsGuardCheck( + builder, loc, createGlobalThreadLocalGuard(builder, loc)); + + // Emit the body of the guarded spot. + builder.setInsertionPointToEnd(&ifOperation.getThenRegion().front()); + for (cir::FuncOp initFunc : globalThreadLocalInitializers) + builder.createCallOp(loc, initFunc, {}); + cir::YieldOp::create(builder, loc); + + builder.setInsertionPointAfter(ifOperation); cir::ReturnOp::create(builder, loc); } + void LoweringPreparePass::buildCXXGlobalInitFunc() { if (dynamicInitializers.empty()) return; diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 33cb98e988ad8..45fa0cf9615d4 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5252,6 +5252,30 @@ llvm::CallInst *CodeGenFunction::EmitRuntimeCall(llvm::FunctionCallee callee, return call; } +llvm::CallInst *CodeGenFunction::EmitIntrinsicCall(llvm::Intrinsic::ID ID, + const llvm::Twine &Name) { + return EmitIntrinsicCall(ID, {}, {}, Name); +} + +llvm::CallInst *CodeGenFunction::EmitIntrinsicCall(llvm::Intrinsic::ID ID, + ArrayRef Args, + const llvm::Twine &Name) { + return EmitIntrinsicCall(ID, {}, Args, Name); +} + +llvm::CallInst *CodeGenFunction::EmitIntrinsicCall(llvm::Intrinsic::ID ID, + ArrayRef Types, + ArrayRef Args, + const llvm::Twine &Name) { + llvm::Function *F = + 
llvm::Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID, Types); + llvm::CallInst *Call = + Builder.CreateCall(F, Args, getBundlesForFunclet(F), Name); + if (CGM.shouldEmitConvergenceTokens() && Call->isConvergent()) + return cast(addConvergenceControlToken(Call)); + return Call; +} + /// Emits a call or invoke to the given noreturn runtime function. void CodeGenFunction::EmitNoreturnRuntimeCallOrInvoke( llvm::FunctionCallee callee, ArrayRef args) { diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index 82b03d7d5f069..a4cd28f97b6d6 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -172,9 +172,8 @@ static Value *handleHlslWaveActiveBallot(CodeGenFunction &CGF, if (CGF.CGM.getTarget().getTriple().isDXIL()) { // Call DXIL intrinsic: returns { i32, i32, i32, i32 } - llvm::Function *Fn = CGF.CGM.getIntrinsic(Intrinsic::dx_wave_ballot, {I32}); - - Value *StructVal = CGF.EmitRuntimeCall(Fn, Cond); + Value *StructVal = + CGF.EmitIntrinsicCall(Intrinsic::dx_wave_ballot, {I32}, {Cond}); assert(StructVal->getType() == Struct4I32 && "dx.wave.ballot must return {i32,i32,i32,i32}"); @@ -190,8 +189,7 @@ static Value *handleHlslWaveActiveBallot(CodeGenFunction &CGF, } if (CGF.CGM.getTarget().getTriple().isSPIRV()) - return CGF.EmitRuntimeCall( - CGF.CGM.getIntrinsic(Intrinsic::spv_subgroup_ballot), Cond); + return CGF.EmitIntrinsicCall(Intrinsic::spv_subgroup_ballot, {Cond}); llvm_unreachable( "WaveActiveBallot is only supported for DXIL and SPIRV targets"); @@ -1288,9 +1286,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, Intrinsic::ID IID = getPrefixCountBitsIntrinsic(getTarget().getTriple().getArch()); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), IID), ArrayRef{Op}, - "hlsl.wave.prefix.bit.count"); + return EmitIntrinsicCall(IID, ArrayRef{Op}, "hlsl.wave.prefix.bit.count"); } case Builtin::BI__builtin_hlsl_select: { Value *OpCond = 
EmitScalarExpr(E->getArg(0)); @@ -1335,9 +1331,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, Value *Op = EmitScalarExpr(E->getArg(0)); Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveActiveAllEqualIntrinsic(); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), ID, {Op->getType()}), - {Op}); + return EmitIntrinsicCall(ID, {Op->getType()}, {Op}); } case Builtin::BI__builtin_hlsl_wave_active_all_true: { Value *Op = EmitScalarExpr(E->getArg(0)); @@ -1345,8 +1339,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, "Intrinsic WaveActiveAllTrue operand must be a bool"); Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveActiveAllTrueIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID), {Op}); + return EmitIntrinsicCall(ID, {Op}); } case Builtin::BI__builtin_hlsl_wave_active_any_true: { Value *Op = EmitScalarExpr(E->getArg(0)); @@ -1354,8 +1347,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, "Intrinsic WaveActiveAnyTrue operand must be a bool"); Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveActiveAnyTrueIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID), {Op}); + return EmitIntrinsicCall(ID, {Op}); } case Builtin::BI__builtin_hlsl_wave_active_bit_or: { Value *Op = EmitScalarExpr(E->getArg(0)); @@ -1364,9 +1356,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, "representation"); Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveActiveBitOrIntrinsic(); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), ID, {Op->getType()}), - ArrayRef{Op}, "hlsl.wave.active.bit.or"); + return EmitIntrinsicCall(ID, {Op->getType()}, ArrayRef{Op}, + "hlsl.wave.active.bit.or"); } case Builtin::BI__builtin_hlsl_wave_active_bit_xor: { Value *Op = EmitScalarExpr(E->getArg(0)); @@ -1375,9 +1366,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, "representation"); 
Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveActiveBitXorIntrinsic(); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), ID, {Op->getType()}), - ArrayRef{Op}, "hlsl.wave.active.bit.xor"); + return EmitIntrinsicCall(ID, {Op->getType()}, ArrayRef{Op}, + "hlsl.wave.active.bit.xor"); } case Builtin::BI__builtin_hlsl_wave_active_bit_and: { Value *Op = EmitScalarExpr(E->getArg(0)); @@ -1386,9 +1376,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, "representation"); Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveActiveBitAndIntrinsic(); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), ID, {Op->getType()}), - ArrayRef{Op}, "hlsl.wave.active.bit.and"); + return EmitIntrinsicCall(ID, {Op->getType()}, ArrayRef{Op}, + "hlsl.wave.active.bit.and"); } case Builtin::BI__builtin_hlsl_wave_active_ballot: { [[maybe_unused]] Value *Op = EmitScalarExpr(E->getArg(0)); @@ -1400,9 +1389,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, case Builtin::BI__builtin_hlsl_wave_active_count_bits: { Value *OpExpr = EmitScalarExpr(E->getArg(0)); Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveActiveCountBitsIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID), - ArrayRef{OpExpr}); + return EmitIntrinsicCall(ID, ArrayRef{OpExpr}); } case Builtin::BI__builtin_hlsl_wave_active_sum: { // Due to the use of variadic arguments, explicitly retrieve argument @@ -1410,9 +1397,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, Intrinsic::ID IID = getWaveActiveSumIntrinsic( getTarget().getTriple().getArch(), E->getArg(0)->getType()); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), IID, {OpExpr->getType()}), - ArrayRef{OpExpr}, "hlsl.wave.active.sum"); + return EmitIntrinsicCall(IID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.wave.active.sum"); } case Builtin::BI__builtin_hlsl_wave_active_product: { // Due to the 
use of variadic arguments, explicitly retrieve argument @@ -1420,9 +1406,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, Intrinsic::ID IID = getWaveActiveProductIntrinsic( getTarget().getTriple().getArch(), E->getArg(0)->getType()); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), IID, {OpExpr->getType()}), - ArrayRef{OpExpr}, "hlsl.wave.active.product"); + return EmitIntrinsicCall(IID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.wave.active.product"); } case Builtin::BI__builtin_hlsl_wave_active_max: { // Due to the use of variadic arguments, explicitly retrieve argument @@ -1434,9 +1419,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, else IID = CGM.getHLSLRuntime().getWaveActiveMaxIntrinsic(); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), IID, {OpExpr->getType()}), - ArrayRef{OpExpr}, "hlsl.wave.active.max"); + return EmitIntrinsicCall(IID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.wave.active.max"); } case Builtin::BI__builtin_hlsl_wave_active_min: { // Due to the use of variadic arguments, explicitly retrieve argument @@ -1448,9 +1432,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, else IID = CGM.getHLSLRuntime().getWaveActiveMinIntrinsic(); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), IID, {OpExpr->getType()}), - ArrayRef{OpExpr}, "hlsl.wave.active.min"); + return EmitIntrinsicCall(IID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.wave.active.min"); } case Builtin::BI__builtin_hlsl_wave_get_lane_index: { // We don't define a SPIR-V intrinsic, instead it is a SPIR-V built-in @@ -1458,8 +1441,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, // for the DirectX intrinsic and the demangled builtin name switch (CGM.getTarget().getTriple().getArch()) { case llvm::Triple::dxil: - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), 
Intrinsic::dx_wave_getlaneindex)); + return EmitIntrinsicCall(Intrinsic::dx_wave_getlaneindex); case llvm::Triple::spirv: return EmitRuntimeCall(CGM.CreateRuntimeFunction( llvm::FunctionType::get(IntTy, {}, false), @@ -1471,54 +1453,46 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, } case Builtin::BI__builtin_hlsl_wave_is_first_lane: { Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveIsFirstLaneIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_wave_get_lane_count: { Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveGetLaneCountIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_wave_read_lane_at: { // Due to the use of variadic arguments we must explicitly retrieve them and // create our function type. Value *OpExpr = EmitScalarExpr(E->getArg(0)); Value *OpIndex = EmitScalarExpr(E->getArg(1)); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), CGM.getHLSLRuntime().getWaveReadLaneAtIntrinsic(), - {OpExpr->getType()}), - ArrayRef{OpExpr, OpIndex}, "hlsl.wave.readlane"); + return EmitIntrinsicCall(CGM.getHLSLRuntime().getWaveReadLaneAtIntrinsic(), + {OpExpr->getType()}, ArrayRef{OpExpr, OpIndex}, + "hlsl.wave.readlane"); } case Builtin::BI__builtin_hlsl_wave_prefix_sum: { Value *OpExpr = EmitScalarExpr(E->getArg(0)); Intrinsic::ID IID = getWavePrefixSumIntrinsic( getTarget().getTriple().getArch(), E->getArg(0)->getType()); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), IID, {OpExpr->getType()}), - ArrayRef{OpExpr}, "hlsl.wave.prefix.sum"); + return EmitIntrinsicCall(IID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.wave.prefix.sum"); } case Builtin::BI__builtin_hlsl_wave_prefix_product: { Value *OpExpr = EmitScalarExpr(E->getArg(0)); Intrinsic::ID IID = 
getWavePrefixProductIntrinsic( getTarget().getTriple().getArch(), E->getArg(0)->getType()); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), IID, {OpExpr->getType()}), - ArrayRef{OpExpr}, "hlsl.wave.prefix.product"); + return EmitIntrinsicCall(IID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.wave.prefix.product"); } case Builtin::BI__builtin_hlsl_quad_read_across_x: { Value *OpExpr = EmitScalarExpr(E->getArg(0)); Intrinsic::ID ID = CGM.getHLSLRuntime().getQuadReadAcrossXIntrinsic(); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), ID, {OpExpr->getType()}), - ArrayRef{OpExpr}, "hlsl.quad.read.across.x"); + return EmitIntrinsicCall(ID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.quad.read.across.x"); } case Builtin::BI__builtin_hlsl_quad_read_across_y: { Value *OpExpr = EmitScalarExpr(E->getArg(0)); Intrinsic::ID ID = CGM.getHLSLRuntime().getQuadReadAcrossYIntrinsic(); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), ID, {OpExpr->getType()}), - ArrayRef{OpExpr}, "hlsl.quad.read.across.y"); + return EmitIntrinsicCall(ID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.quad.read.across.y"); } case Builtin::BI__builtin_hlsl_elementwise_sign: { auto *Arg0 = E->getArg(0); @@ -1576,36 +1550,30 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, return handleHlslClip(E, this); case Builtin::BI__builtin_hlsl_all_memory_barrier: { Intrinsic::ID ID = CGM.getHLSLRuntime().getAllMemoryBarrierIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_all_memory_barrier_with_group_sync: { Intrinsic::ID ID = CGM.getHLSLRuntime().getAllMemoryBarrierWithGroupSyncIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_device_memory_barrier: { Intrinsic::ID ID = 
CGM.getHLSLRuntime().getDeviceMemoryBarrierIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_device_memory_barrier_with_group_sync: { Intrinsic::ID ID = CGM.getHLSLRuntime().getDeviceMemoryBarrierWithGroupSyncIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_group_memory_barrier: { Intrinsic::ID ID = CGM.getHLSLRuntime().getGroupMemoryBarrierIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_group_memory_barrier_with_group_sync: { Intrinsic::ID ID = CGM.getHLSLRuntime().getGroupMemoryBarrierWithGroupSyncIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_elementwise_ddx_coarse: { Value *Op0 = EmitScalarExpr(E->getArg(0)); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index ff9710ef007e4..e512b18a9d333 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4714,6 +4714,15 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::CallInst *EmitRuntimeCall(llvm::FunctionCallee callee, ArrayRef args, const Twine &name = ""); + llvm::CallInst *EmitIntrinsicCall(llvm::Intrinsic::ID ID, + const Twine &Name = ""); + llvm::CallInst *EmitIntrinsicCall(llvm::Intrinsic::ID ID, + ArrayRef Args, + const Twine &Name = ""); + llvm::CallInst *EmitIntrinsicCall(llvm::Intrinsic::ID ID, + ArrayRef Types, + ArrayRef Args, + const Twine &Name = ""); llvm::CallInst *EmitNounwindRuntimeCall(llvm::FunctionCallee callee, const Twine &name = ""); llvm::CallInst *EmitNounwindRuntimeCall(llvm::FunctionCallee callee, diff --git 
a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index b0fe9637ccc9c..c03084518f085 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -2225,6 +2225,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case Builtin::BI__builtin_scalbn: return emitBinaryExpMaybeConstrainedFPBuiltin( *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp); + case AMDGPU::BI__builtin_amdgcn_permlane_bcast: + return emitBuiltinWithOneOverloadedType<3>( + *this, E, Intrinsic::amdgcn_permlane_bcast); + case AMDGPU::BI__builtin_amdgcn_permlane_up: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_permlane_up); + case AMDGPU::BI__builtin_amdgcn_permlane_down: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_permlane_down); + case AMDGPU::BI__builtin_amdgcn_permlane_xor: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_permlane_xor); default: return nullptr; } diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 485fe382bda3a..361072127f8e1 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -1579,7 +1579,7 @@ ContinuationIndenter::getNewLineColumn(const LineState &State) { // in ProtoBuf: // optional int32 b = 2 [(foo_options) = {aaaaaaaaaaaaaaaaaaa: 123, // bbbbbbbbbbbbbbbbbbbbbbbb:"baz"}]; - // For Verilog, a quote following a brace is treated as an identifier. And + // For Verilog, a quote preceding a brace is treated as an identifier. And // Both braces and colons get annotated as TT_DictLiteral. So we have to // check. 
if (Current.is(tok::identifier) && Current.Next && @@ -1654,7 +1654,9 @@ ContinuationIndenter::getNewLineColumn(const LineState &State) { TT_JavaAnnotation, TT_LeadingJavaAnnotation))) || (!Style.IndentWrappedFunctionNames && - NextNonComment->isOneOf(tok::kw_operator, TT_FunctionDeclarationName))) { + NextNonComment->isOneOf(tok::kw_operator, TT_FunctionDeclarationName)) || + (State.Line->ReturnTypeWrapped && PreviousNonComment && + isReturnTypePrefixSpecifier(*PreviousNonComment))) { return std::max(IndentationAndAlignment(CurrentState.LastSpace), CurrentState.Indent); } diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index ec0ad98f37753..a29d62c99bb95 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -749,6 +749,19 @@ struct ScalarEnumerationTraits { } }; +template <> +struct ScalarEnumerationTraits { + static void enumeration(IO &IO, + FormatStyle::BreakBeforeReturnTypeStyle &Value) { + IO.enumCase(Value, "None", FormatStyle::BBRTS_None); + IO.enumCase(Value, "All", FormatStyle::BBRTS_All); + IO.enumCase(Value, "TopLevel", FormatStyle::BBRTS_TopLevel); + IO.enumCase(Value, "AllDefinitions", FormatStyle::BBRTS_AllDefinitions); + IO.enumCase(Value, "TopLevelDefinitions", + FormatStyle::BBRTS_TopLevelDefinitions); + } +}; + template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, FormatStyle::SeparateDefinitionStyle &Value) { @@ -1317,6 +1330,7 @@ template <> struct MappingTraits { IO.mapOptional("BreakBeforeBraces", Style.BreakBeforeBraces); IO.mapOptional("BreakBeforeInlineASMColon", Style.BreakBeforeInlineASMColon); + IO.mapOptional("BreakBeforeReturnType", Style.BreakBeforeReturnType); IO.mapOptional("BreakBeforeTemplateCloser", Style.BreakBeforeTemplateCloser); IO.mapOptional("BreakBeforeTernaryOperators", @@ -1889,6 +1903,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.BreakBeforeCloseBracketSwitch = false; LLVMStyle.BreakBeforeConceptDeclarations = 
FormatStyle::BBCDS_Always; LLVMStyle.BreakBeforeInlineASMColon = FormatStyle::BBIAS_OnlyMultiline; + LLVMStyle.BreakBeforeReturnType = FormatStyle::BBRTS_None; LLVMStyle.BreakBeforeTemplateCloser = false; LLVMStyle.BreakBeforeTernaryOperators = true; LLVMStyle.BreakBinaryOperations = {FormatStyle::BBO_Never, {}}; diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index 1d8f0f1cfe412..7f6721a87877a 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -2132,6 +2132,15 @@ inline bool continuesLineComment(const FormatToken &FormatTok, // Returns \c true if \c Current starts a new parameter. bool startsNextParameter(const FormatToken &Current, const FormatStyle &Style); +// Returns \c true if \c Tok is a function/storage specifier that may appear +// before a function return type (e.g. ``static``, ``inline``, ``constexpr``). +inline bool isReturnTypePrefixSpecifier(const FormatToken &Tok) { + return Tok.isOneOf(tok::kw_static, tok::kw_extern, tok::kw_inline, + tok::kw_virtual, tok::kw_constexpr, tok::kw_consteval, + tok::kw_friend, tok::kw_export, tok::kw__Noreturn, + tok::kw___forceinline); +} + } // namespace format } // namespace clang diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index afdb59617fb2a..9181e0e9d5a2a 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4062,6 +4062,67 @@ bool TokenAnnotator::mustBreakForReturnType(const AnnotatedLine &Line) const { return false; } +bool TokenAnnotator::mustBreakBeforeReturnType( + const AnnotatedLine &Line) const { + assert(Line.MightBeFunctionDecl); + + switch (Style.BreakBeforeReturnType) { + case FormatStyle::BBRTS_None: + return false; + case FormatStyle::BBRTS_All: + return true; + case FormatStyle::BBRTS_TopLevel: + return Line.Level == 0; + case FormatStyle::BBRTS_AllDefinitions: + return Line.mightBeFunctionDefinition(); + case FormatStyle::BBRTS_TopLevelDefinitions: + 
return Line.Level == 0 && Line.mightBeFunctionDefinition(); + } + + return false; +} + +static FormatToken *findReturnTypeStart(const AnnotatedLine &Line) { + auto *Tok = Line.getFirstNonComment(); + if (!Tok) + return nullptr; + + if (Tok->is(tok::kw_template)) { + auto *Opener = Tok->Next; + while (Opener && Opener->isNot(TT_TemplateOpener)) + Opener = Opener->Next; + if (!Opener || !Opener->MatchingParen) + return nullptr; + Tok = Opener->MatchingParen->Next; + } + + if (Tok && Tok->is(TT_RequiresClause)) { + while (Tok && !Tok->ClosesRequiresClause) + Tok = Tok->Next; + if (Tok) + Tok = Tok->Next; + } + + while (Tok) { + if (isReturnTypePrefixSpecifier(*Tok) || + Tok->isOneOf(tok::kw___attribute, tok::kw___declspec, + TT_AttributeMacro)) { + auto *Next = Tok->Next; + if (Next && Next->is(tok::l_paren) && Next->MatchingParen) + Tok = Next->MatchingParen->Next; + else + Tok = Next; + continue; + } + if (Tok->is(TT_AttributeLSquare) && Tok->MatchingParen) { + Tok = Tok->MatchingParen->Next; + continue; + } + break; + } + return Tok; +} + void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { if (Line.Computed) return; @@ -4180,6 +4241,17 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { } } + if (Line.MightBeFunctionDecl && LineIsFunctionDeclaration && + mustBreakBeforeReturnType(Line)) { + if (auto *ReturnTypeStart = findReturnTypeStart(Line); + ReturnTypeStart && ReturnTypeStart != FirstNonComment && + ReturnTypeStart->isNoneOf(TT_FunctionDeclarationName, + TT_CtorDtorDeclName, tok::tilde)) { + ReturnTypeStart->MustBreakBefore = true; + Line.ReturnTypeWrapped = true; + } + } + if (First->is(TT_ElseLBrace)) { First->CanBreakBefore = true; First->MustBreakBefore = true; diff --git a/clang/lib/Format/TokenAnnotator.h b/clang/lib/Format/TokenAnnotator.h index 33c7df9d0f949..52d6e5ca56915 100644 --- a/clang/lib/Format/TokenAnnotator.h +++ b/clang/lib/Format/TokenAnnotator.h @@ -256,6 +256,8 @@ class 
TokenAnnotator { bool mustBreakForReturnType(const AnnotatedLine &Line) const; + bool mustBreakBeforeReturnType(const AnnotatedLine &Line) const; + void printDebugInfo(const AnnotatedLine &Line) const; void calculateUnbreakableTailLengths(AnnotatedLine &Line) const; diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 022fd62ed2bfc..9536a233def58 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -1451,6 +1451,17 @@ void UnwrappedLineParser::parseStructuralElement( while (FormatTok->is(tok::l_square) && handleCppAttributes()) { } } else if (Style.isVerilog()) { + // Skip attributes. + while (FormatTok->is(tok::l_paren) && + Tokens->peekNextToken()->is(tok::star)) { + parseParens(); + } + // Skip things that can exist before keywords like 'if' and 'case'. + if (FormatTok->isOneOf(Keywords.kw_priority, Keywords.kw_unique, + Keywords.kw_unique0)) { + nextToken(); + } + if (Keywords.isVerilogStructuredProcedure(*FormatTok)) { parseForOrWhileLoop(/*HasParens=*/false); return; @@ -1464,19 +1475,6 @@ void UnwrappedLineParser::parseStructuralElement( parseIfThenElse(IfKind, /*KeepBraces=*/false, /*IsVerilogAssert=*/true); return; } - - // Skip things that can exist before keywords like 'if' and 'case'. - while (true) { - if (FormatTok->isOneOf(Keywords.kw_priority, Keywords.kw_unique, - Keywords.kw_unique0)) { - nextToken(); - } else if (FormatTok->is(tok::l_paren) && - Tokens->peekNextToken()->is(tok::star)) { - parseParens(); - } else { - break; - } - } } // Tokens that only make sense at the beginning of a line. 
@@ -3376,6 +3374,7 @@ void UnwrappedLineParser::parseDoWhile() { void UnwrappedLineParser::parseLabel( FormatStyle::IndentGotoLabelStyle IndentGotoLabels) { + const bool IsGotoLabel = FormatTok->is(TT_GotoLabelColon); nextToken(); unsigned OldLineLevel = Line->Level; @@ -3392,9 +3391,8 @@ void UnwrappedLineParser::parseLabel( break; } - if (!Style.IndentCaseBlocks && CommentsBeforeNextToken.empty() && - FormatTok->is(tok::l_brace)) { - + if (!IsGotoLabel && !Style.IndentCaseBlocks && + CommentsBeforeNextToken.empty() && FormatTok->is(tok::l_brace)) { CompoundStatementIndenter Indenter(this, Line->Level, Style.BraceWrapping.AfterCaseLabel, Style.BraceWrapping.IndentBraces); diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 45a47ec797f01..7f3c575fb68bb 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -3846,12 +3846,13 @@ OMPClause *Parser::ParseOpenMPOMPXAttributesClause(bool ParseOnly) { continue; case ParsedAttr::AT_CUDALaunchBounds: if (!PA.checkAtLeastNumArgs(Actions, 1) || - !PA.checkAtMostNumArgs(Actions, 2)) + !PA.checkAtMostNumArgs(Actions, 3)) continue; if (auto *A = Actions.CreateLaunchBoundsAttr( PA, PA.getArgAsExpr(0), PA.getNumArgs() > 1 ? PA.getArgAsExpr(1) : nullptr, - PA.getNumArgs() > 2 ? PA.getArgAsExpr(2) : nullptr)) + PA.getNumArgs() > 2 ? 
PA.getArgAsExpr(2) : nullptr, + /*IgnoreArch=*/true)) Attrs.push_back(A); continue; default: diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 55b6cbcbba57d..364f4de077ca7 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -6017,7 +6017,8 @@ static Expr *makeLaunchBoundsArgExpr(Sema &S, Expr *E, CUDALaunchBoundsAttr * Sema::CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, - Expr *MinBlocks, Expr *MaxBlocks) { + Expr *MinBlocks, Expr *MaxBlocks, + bool IgnoreArch) { CUDALaunchBoundsAttr TmpAttr(Context, CI, MaxThreads, MinBlocks, MaxBlocks); MaxThreads = makeLaunchBoundsArgExpr(*this, MaxThreads, TmpAttr, 0); if (!MaxThreads) @@ -6030,14 +6031,20 @@ Sema::CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, } if (MaxBlocks) { - // '.maxclusterrank' ptx directive requires .target sm_90 or higher. - auto SM = getOffloadArch(Context.getTargetInfo()); - if (SM == OffloadArch::Unknown || SM < OffloadArch::SM_90) { - Diag(MaxBlocks->getBeginLoc(), diag::warn_cuda_maxclusterrank_sm_90) - << OffloadArchToString(SM) << CI << MaxBlocks->getSourceRange(); - // Ignore it by setting MaxBlocks to null; - MaxBlocks = nullptr; - } else { + // We might want to ignore the nvptx arch check, e.g., when processing the + // launch bounds attribute within ompx_attribute to support other archs. + if (!IgnoreArch) { + // '.maxclusterrank' ptx directive requires .target sm_90 or higher. 
+ auto SM = getOffloadArch(Context.getTargetInfo()); + if (SM == OffloadArch::Unknown || SM < OffloadArch::SM_90) { + Diag(MaxBlocks->getBeginLoc(), diag::warn_cuda_maxclusterrank_sm_90) + << OffloadArchToString(SM) << CI << MaxBlocks->getSourceRange(); + // Ignore it by setting MaxBlocks to null; + MaxBlocks = nullptr; + } + } + + if (MaxBlocks) { MaxBlocks = makeLaunchBoundsArgExpr(*this, MaxBlocks, TmpAttr, 2); if (!MaxBlocks) return nullptr; diff --git a/clang/test/CIR/CodeGen/global-tls-dyn-init.cpp b/clang/test/CIR/CodeGen/global-tls-dyn-init.cpp index f471d586aa850..1359ac84b3cc3 100644 --- a/clang/test/CIR/CodeGen/global-tls-dyn-init.cpp +++ b/clang/test/CIR/CodeGen/global-tls-dyn-init.cpp @@ -10,6 +10,7 @@ struct CtorDtor { int i; }; +// LLVM-BOTH-DAG: @__tls_guard = internal thread_local global i8 0, align 1 // LLVM-BOTH-DAG: @__dso_handle = external hidden global i8 // LLVM-BOTH-DAG: @tls_cd = thread_local global %struct.CtorDtor { i32 5 }, align 4 // LLVM-BOTH-DAG: @tls_cd_dyn = thread_local global %struct.CtorDtor zeroinitializer, align 4 @@ -22,6 +23,7 @@ struct CtorDtor { // LLVM-BOTH-DAG: @_ZTH6tls_cd = alias void (), ptr @__tls_init // Wrappers & aliases. 
+// CIR: cir.global internal tls_dyn @__tls_guard = #cir.int<0> : !s8i {alignment = 1 : i64} // CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW19tls_cd_dyn_not_used() -> !cir.ptr { // CIR: cir.call @_ZTH19tls_cd_dyn_not_used() : () -> () // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_dyn_not_used : !cir.ptr @@ -47,6 +49,18 @@ struct CtorDtor { // CIR: cir.func @_ZTH6tls_cd() alias(@__tls_init) // CIR-LABEL: cir.func internal private @__tls_init() { +// CIR: %[[GET_GUARD:.*]] = cir.get_global thread_local @__tls_guard : !cir.ptr +// CIR: %[[LOAD_GUARD:.*]] = cir.load align(1) %[[GET_GUARD]] : !cir.ptr, !s8i +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s8i +// CIR: %[[CMP:.*]] = cir.cmp eq %[[LOAD_GUARD]], %[[ZERO]] : !s8i +// CIR: cir.if %[[CMP]] { +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s8i +// CIR: cir.store %[[ONE]], %[[GET_GUARD]] : !s8i, !cir.ptr +// CIR: cir.call @[[TLS_CD_INIT:.*]]() : () -> () +// CIR: cir.call @[[TLS_CD_DYN_INIT:.*]]() : () -> () +// CIR: cir.call @[[TLS_CD_REF_INIT:.*]]() : () -> () +// CIR: cir.call @[[TLS_CD_DYN_NOT_USED_INIT:.*]]() : () -> () +// CIR: } // CIR: cir.return // LLVM: define weak_odr hidden ptr @_ZTW19tls_cd_dyn_not_used() { @@ -74,6 +88,16 @@ struct CtorDtor { // LLVM: } // // LLVM: define internal void @__tls_init() { +// LLVM: %[[GET_GUARD:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__tls_guard) +// LLVM: %[[LOAD_GUARD:.*]] = load i8, ptr %[[GET_GUARD]], align 1 +// LLVM: %[[IS_UNINIT:.*]] = icmp eq i8 %[[LOAD_GUARD]], 0 +// LLVM: br i1 %[[IS_UNINIT]] +// LLVM +// LLVM: store i8 1, ptr %[[GET_GUARD]], align 1 +// LLVM: call void @[[TLS_CD_INIT:.*]]() +// LLVM: call void @[[TLS_CD_DYN_INIT:.*]]() +// LLVM: call void @[[TLS_CD_REF_INIT:.*]]() +// LLVM: call void @[[TLS_CD_DYN_NOT_USED_INIT:.*]]() thread_local CtorDtor tls_cd = 5; // CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW6tls_cd", "_ZTH6tls_cd"> @tls_cd = #cir.const_record<{#cir.int<5> : !s32i}> : 
!rec_CtorDtor dtor { @@ -81,7 +105,7 @@ thread_local CtorDtor tls_cd = 5; // CIR-BEFORE-LPP: cir.call @_ZN8CtorDtorD1Ev(%[[GET_GLOB]]) : (!cir.ptr) -> () // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW6tls_cd", "_ZTH6tls_cd"> @tls_cd = #cir.const_record<{#cir.int<5> : !s32i}> : !rec_CtorDtor -// CIR: cir.func internal private @[[TLS_CD_INIT:.*]]() { +// CIR: cir.func internal private @[[TLS_CD_INIT]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd : !cir.ptr // CIR: %[[GET_DTOR:.*]] = cir.get_global @_ZN8CtorDtorD1Ev : !cir.ptr)>> // CIR: %[[DTOR_DECAY:.*]] = cir.cast bitcast %[[GET_DTOR]] : !cir.ptr)>> -> !cir.ptr)>> @@ -90,7 +114,7 @@ thread_local CtorDtor tls_cd = 5; // CIR: cir.call @__cxa_thread_atexit(%[[DTOR_DECAY]], %[[GLOB_DECAY]], %[[DSOHANDLE]]) : (!cir.ptr)>>, !cir.ptr, !cir.ptr) -> () // CIR: cir.return // -// LLVM: define internal void @[[TLS_CD_INIT:.*]]() { +// LLVM: define internal void @[[TLS_CD_INIT]]() { // OGCG: define internal void @[[TLS_CD_INIT:.*]]() {{.*}}{ // LLVM: %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_cd) // LLVM: call void @__cxa_thread_atexit(ptr @_ZN8CtorDtorD1Ev, ptr %[[GET_GLOB]], ptr @__dso_handle) @@ -107,7 +131,7 @@ thread_local CtorDtor tls_cd_dyn = get_i(); // CIR-BEFORE-LPP: cir.call @_ZN8CtorDtorD1Ev(%[[GET_GLOB]]) : (!cir.ptr) -> () // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW10tls_cd_dyn", "_ZTH10tls_cd_dyn"> @tls_cd_dyn = #cir.zero : !rec_CtorDtor -// CIR: cir.func internal private @[[TLS_CD_DYN_INIT:.*]]() { +// CIR: cir.func internal private @[[TLS_CD_DYN_INIT]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_dyn : !cir.ptr // CIR: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) // CIR: cir.call @_ZN8CtorDtorC1Ei(%[[GET_GLOB]], %[[CALL]]) @@ -119,7 +143,7 @@ thread_local CtorDtor tls_cd_dyn = get_i(); // CIR: cir.call @__cxa_thread_atexit(%[[DTOR_DECAY]], %[[GLOB_DECAY]], 
%[[DSOHANDLE]]) : (!cir.ptr)>>, !cir.ptr, !cir.ptr) -> () // CIR: cir.return // -// LLVM: define internal void @[[TLS_CD_DYN_INIT:.*]]() { +// LLVM: define internal void @[[TLS_CD_DYN_INIT]]() { // OGCG: define internal void @[[TLS_CD_DYN_INIT:.*]]() {{.*}} { // LLVM: %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_cd_dyn) // LLVM-BOTH: %[[CALL:.*]] = call noundef i32 @_Z5get_iv() @@ -137,13 +161,13 @@ thread_local CtorDtor &tls_cd_ref = tls_cd_dyn; // CIR-BEFORE-LPP: cir.store {{.*}}%[[CALL]], %[[GET_GLOB]] : !cir.ptr, !cir.ptr> // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW10tls_cd_ref", "_ZTH10tls_cd_ref"> @tls_cd_ref = #cir.ptr : !cir.ptr -// CIR: cir.func internal private @[[TLS_CD_REF_INIT:.*]]() { +// CIR: cir.func internal private @[[TLS_CD_REF_INIT]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_ref : !cir.ptr> // CIR: %[[GET_DYN:.*]] = cir.call @_ZTW10tls_cd_dyn() : () -> !cir.ptr // CIR: cir.store align(8) %[[GET_DYN]], %[[GET_GLOB]] : !cir.ptr, !cir.ptr> // CIR: cir.return // -// LLVM: define internal void @[[TLS_CD_REF_INIT:.*]]() { +// LLVM: define internal void @[[TLS_CD_REF_INIT]]() { // OGCG: define internal void @[[TLS_CD_REF_INIT:.*]]() {{.*}} { // LLVM: %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_cd_ref) // LLVM-BOTH: %[[CALL:.*]] = call ptr @_ZTW10tls_cd_dyn() @@ -167,7 +191,7 @@ thread_local CtorDtor tls_cd_dyn_not_used = get_i(); // CIR-BEFORE-LPP: cir.call @_ZN8CtorDtorD1Ev(%[[GET_GLOB]]) : (!cir.ptr) -> () // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW19tls_cd_dyn_not_used", "_ZTH19tls_cd_dyn_not_used"> @tls_cd_dyn_not_used = #cir.zero : !rec_CtorDtor -// CIR: cir.func internal private @[[TLS_CD_DYN_NOT_USED_INIT:.*]]() { +// CIR: cir.func internal private @[[TLS_CD_DYN_NOT_USED_INIT]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_dyn_not_used : !cir.ptr // CIR: %[[CALL:.*]] = cir.call 
@_Z5get_iv() : () -> (!s32i {llvm.noundef}) // CIR: cir.call @_ZN8CtorDtorC1Ei(%[[GET_GLOB]], %[[CALL]]) @@ -179,7 +203,7 @@ thread_local CtorDtor tls_cd_dyn_not_used = get_i(); // CIR: cir.call @__cxa_thread_atexit(%[[DTOR_DECAY]], %[[GLOB_DECAY]], %[[DSOHANDLE]]) : (!cir.ptr)>>, !cir.ptr, !cir.ptr) -> () // CIR: cir.return // -// LLVM: define internal void @[[TLS_CD_DYN_NOT_USED_INIT:.*]]() { +// LLVM: define internal void @[[TLS_CD_DYN_NOT_USED_INIT]]() { // OGCG: define internal void @[[TLS_CD_DYN_NOT_USED_INIT:.*]]() {{.*}} { // LLVM: %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_cd_dyn_not_used) // LLVM-BOTH: %[[CALL:.*]] = call noundef i32 @_Z5get_iv() diff --git a/clang/test/CIR/CodeGen/global-tls-simple-init.cpp b/clang/test/CIR/CodeGen/global-tls-simple-init.cpp index c14c21358ebff..4d480bfa64f5b 100644 --- a/clang/test/CIR/CodeGen/global-tls-simple-init.cpp +++ b/clang/test/CIR/CodeGen/global-tls-simple-init.cpp @@ -54,12 +54,25 @@ struct CtorDtor { // Full init of all variables (func names below). 
// CIR-LABEL: cir.func internal private @__tls_init() { +// CIR: %[[GET_GUARD:.*]] = cir.get_global thread_local @__tls_guard : !cir.ptr +// CIR: %[[LOAD_GUARD:.*]] = cir.load align(1) %[[GET_GUARD]] : !cir.ptr, !s8i +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s8i +// CIR: %[[CMP:.*]] = cir.cmp eq %[[LOAD_GUARD]], %[[ZERO]] : !s8i +// CIR: cir.if %[[CMP]] { +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s8i +// CIR: cir.store %[[ONE]], %[[GET_GUARD]] : !s8i, !cir.ptr +// CIR: cir.call @[[TLS_INT_DYN_INIT:.*]]() : () -> () +// CIR: cir.call @[[TLS_INT_REF_INIT:.*]]() : () -> () +// CIR: cir.call @[[TLS_INT_SELF_REF_INIT:.*]]() : () -> () +// CIR: cir.call @[[DEF_INITED_DYN:.*]]() : () -> () +// CIR: } // CIR: cir.return // CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW7tls_int() -> !cir.ptr { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_int : !cir.ptr // CIR: cir.return %[[GET_GLOB]] +// LLVM-BOTH-DAG: @__tls_guard = internal thread_local global i8 0, align 1 // LLVM-BOTH-DAG: @tls_int = thread_local global i32 5, align 4 // LLVM-BOTH-DAG: @tls_int_dyn = thread_local global i32 0, align 4 // LLVM-BOTH-DAG: @tls_int_ref = thread_local global ptr null, align 8 @@ -107,6 +120,17 @@ struct CtorDtor { // LLVM: ret ptr %[[GET_GLOB]] // LLVM: define internal void @__tls_init() { +// LLVM: %[[GET_GUARD:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__tls_guard) +// LLVM: %[[LOAD_GUARD:.*]] = load i8, ptr %[[GET_GUARD]], align 1 +// LLVM: %[[IS_UNINIT:.*]] = icmp eq i8 %[[LOAD_GUARD]], 0 +// LLVM: br i1 %[[IS_UNINIT]] +// +// LLVM: store i8 1, ptr %[[GET_GUARD]], align 1 +// LLVM: call void @[[TLS_INT_DYN_INIT:.*]]() +// LLVM: call void @[[TLS_INT_REF_INIT:.*]]() +// LLVM: call void @[[TLS_INT_SELF_REF_INIT:.*]]() +// LLVM: call void @[[DEF_INITED_DYN:.*]]() +// LLVM: br label // LLVM: ret void // LLVM: define weak_odr hidden ptr @_ZTW7tls_int() { @@ -126,12 +150,12 @@ thread_local int tls_int_dyn = get_i(); // CIR-BEFORE-LPP: cir.store 
{{.*}}%[[CALL]], %[[GET_GLOB]] : !s32i, !cir.ptr // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW11tls_int_dyn", "_ZTH11tls_int_dyn"> @tls_int_dyn = #cir.int<0> : !s32i -// CIR: cir.func internal private @[[TLS_INT_DYN_INIT:.*]]() { +// CIR: cir.func internal private @[[TLS_INT_DYN_INIT]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_int_dyn : !cir.ptr // CIR: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) // CIR: cir.store {{.*}}%[[CALL]], %[[GET_GLOB]] : !s32i, !cir.ptr // CIR: cir.return -// LLVM: define internal void @[[TLS_INT_DYN_INIT:.*]]() { +// LLVM: define internal void @[[TLS_INT_DYN_INIT]]() { // OGCG: define internal void @[[TLS_INT_DYN_INIT:.*]]() // OGCG: %[[CALL:.*]] = call noundef i32 @_Z5get_iv() // LLVM-BOTH: %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_int_dyn) @@ -146,12 +170,12 @@ thread_local int &tls_int_ref = tls_int_dyn; // CIR-BEFORE-LPP: cir.store {{.*}}%[[GET_OTHER]], %[[GET_GLOB]] : !cir.ptr, !cir.ptr> // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW11tls_int_ref", "_ZTH11tls_int_ref"> @tls_int_ref = #cir.ptr : !cir.ptr -// CIR: cir.func internal private @[[TLS_INT_REF_INIT:.*]]() { +// CIR: cir.func internal private @[[TLS_INT_REF_INIT]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_int_ref : !cir.ptr> // CIR: %[[GET_REF:.*]] = cir.call @_ZTW11tls_int_dyn() : () -> !cir.ptr // CIR: cir.store {{.*}}%[[GET_REF]], %[[GET_GLOB]] : !cir.ptr, !cir.ptr> // CIR: cir.return -// LLVM: define internal void @[[TLS_INT_REF_INIT:.*]]() { +// LLVM: define internal void @[[TLS_INT_REF_INIT]]() { // OGCG: define internal void @[[TLS_INT_REF_INIT:.*]]() // OGCG: %[[GET_REF:.*]] = call ptr @_ZTW11tls_int_dyn() // LLVM-BOTH: %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_int_ref) @@ -174,7 +198,7 @@ thread_local int tls_int_self_init = tls_int_self_init + get_i(); // 
CIR-BEFORE-LPP: cir.store {{.*}}%[[ADD]], %[[GET_GLOB]] : !s32i, !cir.ptr // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW17tls_int_self_init", "_ZTH17tls_int_self_init"> @tls_int_self_init = #cir.int<0> : !s32i -// CIR: cir.func internal private @[[TLS_INT_SELF_REF_INIT:.*]]() { +// CIR: cir.func internal private @[[TLS_INT_SELF_REF_INIT]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_int_self_init : !cir.ptr // CIR: %[[GET_SELF_FROM_WRAPPER:.*]] = cir.call @_ZTW17tls_int_self_init() : () -> !cir.ptr // CIR: %[[SELF_LOAD:.*]] = cir.load {{.*}}%[[GET_SELF_FROM_WRAPPER]] : !cir.ptr, !s32i @@ -182,7 +206,7 @@ thread_local int tls_int_self_init = tls_int_self_init + get_i(); // CIR: %[[ADD:.*]] = cir.add nsw %[[SELF_LOAD]], %[[CALL]] : !s32i // CIR: cir.store{{.*}} %[[ADD]], %[[GET_GLOB]] : !s32i, !cir.ptr // CIR: cir.return -// LLVM: define internal void @[[TLS_INT_SELF_REF_INIT:.*]]() { +// LLVM: define internal void @[[TLS_INT_SELF_REF_INIT]]() { // OGCG: define internal void @[[TLS_INT_SELF_REF_INIT:.*]]() // LLVM: %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_int_self_init) // LLVM-BOTH: %[[GET_SELF_FROM_WRAPPER:.*]] = call ptr @_ZTW17tls_int_self_init() @@ -209,12 +233,12 @@ extern thread_local int definitely_inited_dyn = get_i(); // CIR-BEFORE-LPP: cir.store {{.*}}%[[CALL]], %[[GET_GLOB]] : !s32i, !cir.ptr // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW21definitely_inited_dyn", "_ZTH21definitely_inited_dyn"> @definitely_inited_dyn = #cir.int<0> : !s32i -// CIR: cir.func internal private @[[DEF_INITED_DYN:.*]]() { +// CIR: cir.func internal private @[[DEF_INITED_DYN]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @definitely_inited_dyn : !cir.ptr // CIR: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) // CIR: cir.store align(4) %[[CALL]], %[[GET_GLOB]] : !s32i, !cir.ptr // CIR: cir.return -// LLVM: define internal void 
@[[DEF_INITED_DYN:.*]]() { +// LLVM: define internal void @[[DEF_INITED_DYN]]() { // OGCG: define internal void @[[DEF_INITED_DYN:.*]]() // LLVM: %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @definitely_inited_dyn) // LLVM-BOTH: %[[CALL:.*]] = call noundef i32 @_Z5get_iv() diff --git a/clang/test/CIR/CodeGen/global-tls-templates.cpp b/clang/test/CIR/CodeGen/global-tls-templates.cpp index 95cf1a26069e9..d6af8e90d2229 100644 --- a/clang/test/CIR/CodeGen/global-tls-templates.cpp +++ b/clang/test/CIR/CodeGen/global-tls-templates.cpp @@ -38,6 +38,7 @@ thread_local T tls_templ = {get_i()}; // Alias: Ctor/Dtor: // CIR: cir.func linkonce_odr @_ZTH9tls_templI8CtorDtorE() alias(@[[CTOR_DTOR_INIT:[^)]*]]) // TLS Guard: Ctor/Dtor: +// CIR: cir.global "private" linkonce_odr comdat tls_dyn @_ZGV9tls_templI8CtorDtorE = #cir.int<0> : !s64i // Wrapper: int // CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW9tls_templIiE() -> !cir.ptr @@ -48,15 +49,28 @@ thread_local T tls_templ = {get_i()}; // Alias: int // CIR: cir.func linkonce_odr @_ZTH9tls_templIiE() alias(@[[INT_INIT:[^)]*]]) +// TLS Guard: int +// CIR: cir.global "private" linkonce_odr comdat tls_dyn @_ZGV9tls_templIiE = #cir.int<0> : !s64i // Global: int // CIR: cir.global linkonce_odr comdat tls_dyn dyn_tls_refs = <"_ZTW9tls_templIiE", "_ZTH9tls_templIiE", "_ZGV9tls_templIiE"> @_Z9tls_templIiE = #cir.int<0> : !s32i // Init Func: int // CIR: cir.func internal private @[[INT_INIT]]() { +// CIR: %[[GET_GUARD:.*]] = cir.get_global thread_local @_ZGV9tls_templIiE : !cir.ptr +// CIR: %[[GUARD_CAST:.*]] = cir.cast bitcast %[[GET_GUARD]] : !cir.ptr -> !cir.ptr +// CIR: %[[LOAD_GUARD:.*]] = cir.load align(8) %[[GUARD_CAST]] : !cir.ptr, !s8i +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s8i +// CIR: %[[ISUNINIT:.*]] = cir.cmp eq %[[LOAD_GUARD]], %[[ZERO]] : !s8i +// CIR: cir.if %[[ISUNINIT]] { +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i +// CIR: cir.store %[[ONE]], %[[GET_GUARD]] : !s64i, !cir.ptr 
// CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @_Z9tls_templIiE : !cir.ptr // CIR: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) // CIR: cir.store {{.*}}%[[CALL]], %[[GET_GLOB]] : !s32i, !cir.ptr +// CIR: } +// CIR: cir.return +// CIR: } // Global: Ctor/Dotr: @@ -64,6 +78,14 @@ thread_local T tls_templ = {get_i()}; // Init Func: Ctor/Dtor: // CIR: cir.func internal private @[[CTOR_DTOR_INIT]]() { +// CIR: %[[GET_GUARD:.*]] = cir.get_global thread_local @_ZGV9tls_templI8CtorDtorE : !cir.ptr +// CIR: %[[GUARD_CAST:.*]] = cir.cast bitcast %[[GET_GUARD]] : !cir.ptr -> !cir.ptr +// CIR: %[[LOAD_GUARD:.*]] = cir.load align(8) %[[GUARD_CAST]] : !cir.ptr, !s8i +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s8i +// CIR: %[[ISUNINIT:.*]] = cir.cmp eq %[[LOAD_GUARD]], %[[ZERO]] : !s8i +// CIR: cir.if %[[ISUNINIT]] { +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i +// CIR: cir.store %[[ONE]], %[[GET_GUARD]] : !s64i, !cir.ptr // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @_Z9tls_templI8CtorDtorE : !cir.ptr // CIR: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) // CIR: cir.call @_ZN8CtorDtorC1Ei(%[[GET_GLOB]], %[[CALL]]) : (!cir.ptr {{.*}}, !s32i {llvm.noundef}) -> () @@ -73,9 +95,15 @@ thread_local T tls_templ = {get_i()}; // CIR: %[[GLOB_DECAY:.*]] = cir.cast bitcast %[[GET_GLOB:.*]] : !cir.ptr -> !cir.ptr // CIR: %[[DSO_HANDLE:.*]] = cir.get_global @__dso_handle : !cir.ptr // CIR: cir.call @__cxa_thread_atexit(%[[DTOR_FPTR]], %[[GLOB_DECAY]], %[[DSO_HANDLE]]) : (!cir.ptr)>>, !cir.ptr, !cir.ptr) -> () +// CIR: } +// CIR: cir.return +// CIR: } // FIXME: These have inconsistent COMDAT with classic codegen, but we don't // currently specify 'comdat' with a name. 
+// Guards: +// LLVM-BOTH-DAG: @_ZGV9tls_templI8CtorDtorE = linkonce_odr thread_local global i64 0, comdat{{.*}}, align 8 +// LLVM-BOTH-DAG: @_ZGV9tls_templIiE = linkonce_odr thread_local global i64 0, comdat{{.*}}, align 8 // Globals: // LLVM-BOTH-DAG: @_Z9tls_templIiE = linkonce_odr thread_local global i32 0, comdat, align 4 // LLVM-BOTH-DAG: @_Z9tls_templI8CtorDtorE = linkonce_odr thread_local global %struct.CtorDtor zeroinitializer, comdat, align 4 @@ -121,10 +149,13 @@ thread_local T tls_templ = {get_i()}; // but ALWAYS treats the load/stores as i8. This is likely a 'bug' in OGCG, but one that // doesn't really matter at all. // LLVM-BOTH: define internal void @[[INT_INIT]]() +// LLVM: %[[GET_GUARD:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @_ZGV9tls_templIiE) +// LLVM: %[[LOAD_GUARD:.*]] = load i8, ptr %[[GET_GUARD]], align 8 // OGCG: %[[LOAD_GUARD:.*]] = load i8, ptr @_ZGV9tls_templIiE, align 8 -// OGCG: %[[ISUNINIT:.*]] = icmp eq i{{.*}} %[[LOAD_GUARD]], 0 -// OGCG: br i1 %[[ISUNINIT]] +// LLVM-BOTH: %[[ISUNINIT:.*]] = icmp eq i{{.*}} %[[LOAD_GUARD]], 0 +// LLVM-BOTH: br i1 %[[ISUNINIT]] // +// LLVM: store i64 1, ptr %[[GET_GUARD]], align 8 // OGCG: store i8 1, ptr @_ZGV9tls_templIiE, align 8 // LLVM: %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@_Z9tls_templIiE) // LLVM: %[[CALL:.*]] = call noundef i32 @_Z5get_iv() @@ -133,10 +164,13 @@ thread_local T tls_templ = {get_i()}; // LLVM-BOTH: store i32 %[[CALL]], ptr %[[GET_GLOB]] // LLVM-BOTH: define internal void @[[CTOR_DTOR_INIT]]() +// LLVM: %[[GET_GUARD:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @_ZGV9tls_templI8CtorDtorE) +// LLVM: %[[LOAD_GUARD:.*]] = load i8, ptr %[[GET_GUARD]], align 8 // OGCG: %[[LOAD_GUARD:.*]] = load i8, ptr @_ZGV9tls_templI8CtorDtorE, align 8 -// OGCG: %[[ISUNINIT:.*]] = icmp eq i{{.*}} %[[LOAD_GUARD]], 0 -// OGCG: br i1 %[[ISUNINIT]] +// LLVM-BOTH: %[[ISUNINIT:.*]] = icmp eq i{{.*}} %[[LOAD_GUARD]], 0 +// LLVM-BOTH: br i1 %[[ISUNINIT]] // 
+// LLVM: store i64 1, ptr %[[GET_GUARD]], align 8 // OGCG: store i8 1, ptr @_ZGV9tls_templI8CtorDtorE, align 8 // // LLVM: %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@_Z9tls_templI8CtorDtorE) diff --git a/clang/test/CIR/CodeGen/temporary-materialization-adjust.cpp b/clang/test/CIR/CodeGen/temporary-materialization-adjust.cpp new file mode 100644 index 0000000000000..b4761f56300b4 --- /dev/null +++ b/clang/test/CIR/CodeGen/temporary-materialization-adjust.cpp @@ -0,0 +1,61 @@ +// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR +// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM +// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=LLVM + +struct Base { int x; }; + +void Field() { + const int &r = Base().x; +} +// CIR-LABEL: cir.func {{.*}}@_Z5Fieldv() +// CIR: %[[TEMP_ALLOCA:.*]] = cir.alloca !rec_Base, !cir.ptr, ["ref.tmp0"] +// CIR: %[[R_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["r", init, const] +// CIR: %[[GET_MEM:.*]] = cir.get_member %[[TEMP_ALLOCA]][0] {name = "x"} : !cir.ptr -> !cir.ptr +// CIR: cir.store align(8) %[[GET_MEM]], %[[R_ALLOCA]] : !cir.ptr, !cir.ptr> + +// LLVM-LABEL: define {{.*}}@_Z5Fieldv() +// LLVM-DAG: %[[TEMP_ALLOCA:.*]] = alloca %struct.Base +// LLVM-DAG: %[[R_ALLOCA:.*]] = alloca ptr +// LLVM: %[[GET_MEM:.*]] = getelementptr inbounds nuw %struct.Base, ptr %[[TEMP_ALLOCA]], i32 0, i32 0 +// LLVM: store ptr %[[GET_MEM]], ptr %[[R_ALLOCA]], align 8 + +void MemPtr(int Base::*mp) { + const int &r = Base().*mp; +} +// CIR-LABEL: cir.func {{.*}}@_Z6MemPtrM4Basei +// CIR: %[[MP_ALLOCA:.*]] = cir.alloca !s64i, !cir.ptr, ["mp", init] {alignment = 8 : i64} +// CIR: %[[TEMP_ALLOCA:.*]] = cir.alloca !rec_Base, 
!cir.ptr, ["ref.tmp0"] {alignment = 4 : i64} +// CIR: %[[R_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["r", init, const] {alignment = 8 : i64} +// CIR: %[[ARG_LOAD:.*]] = cir.load align(8) %[[MP_ALLOCA]] : !cir.ptr, !s64i +// CIR: %[[TEMP_LOAD:.*]] = cir.cast bitcast %[[TEMP_ALLOCA]] : !cir.ptr -> !cir.ptr +// CIR: %[[STRIDE:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ARG_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CIR: %[[TO_INT:.*]] = cir.cast bitcast %[[STRIDE:.*]] : !cir.ptr -> !cir.ptr +// CIR: cir.store align(8) %[[TO_INT]], %[[R_ALLOCA]] : !cir.ptr, !cir.ptr> + +// LLVM-LABEL: define {{.*}}@_Z6MemPtrM4Basei +// LLVM: %[[MP_ALLOCA:.*]] = alloca i64 +// LLVM-DAG: %[[TEMP_ALLOCA:.*]] = alloca %struct.Base +// LLVM-DAG: %[[R_ALLOCA:.*]] = alloca ptr +// LLVM: %[[ARG_LOAD:.*]] = load i64, ptr %[[MP_ALLOCA]], align 8 +// LLVM: %[[STRIDE:.*]] = getelementptr {{.*}}i8, ptr %[[TEMP_ALLOCA]], i64 %[[ARG_LOAD]] +// LLVM: store ptr %[[STRIDE]], ptr %[[R_ALLOCA]], align 8 + +struct Derived : Base {}; +void DerivedToBase() { + const int &r = Derived().x; +} +// CIR-LABEL: cir.func {{.*}}@_Z13DerivedToBasev() +// CIR: %[[TEMP_ALLOCA:.*]] = cir.alloca !rec_Derived, !cir.ptr, ["ref.tmp0"] {alignment = 4 : i64} +// CIR: %[[R_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["r", init, const] {alignment = 8 : i64} +// CIR: %[[BASE:.*]] = cir.base_class_addr %[[TEMP_ALLOCA]] : !cir.ptr nonnull [0] -> !cir.ptr +// CIR: %[[GET_MEM:.*]] = cir.get_member %[[BASE]][0] {name = "x"} : !cir.ptr -> !cir.ptr +// CIR: cir.store align(8) %[[GET_MEM]], %[[R_ALLOCA]] : !cir.ptr, !cir.ptr> + +// LLVM-LABEL: define {{.*}}@_Z13DerivedToBasev +// LLVM-DAG: %[[TEMP_ALLOCA:.*]] = alloca %struct.Derived +// LLVM-DAG: %[[R_ALLOCA:.*]] = alloca ptr +// LLVM: %[[GET_MEM:.*]] = getelementptr inbounds nuw %struct.Base, ptr %[[TEMP_ALLOCA]], i32 0, i32 0 +// LLVM: store ptr %[[GET_MEM]], ptr %[[R_ALLOCA]], align 8 diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c 
b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c index e386d2cca2cb1..40635342b8949 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c @@ -611,46 +611,6 @@ mfloat8x16_t test_vdupq_laneq_mf8(mfloat8x16_t a) { return vdupq_laneq_mf8(a, 7); } -// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_mf8( -// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -mfloat8x8_t test_vtrn1_mf8(mfloat8x8_t a, mfloat8x8_t b) { - return vtrn1_mf8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_mf8( -// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -mfloat8x16_t test_vtrn1q_mf8(mfloat8x16_t a, mfloat8x16_t b) { - return vtrn1q_mf8(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_mf8( -// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -mfloat8x8_t test_vtrn2_mf8(mfloat8x8_t a, mfloat8x8_t b) { - return vtrn2_mf8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_mf8( -// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -mfloat8x16_t test_vtrn2q_mf8(mfloat8x16_t a, mfloat8x16_t b) { - return vtrn2q_mf8(a, b); -} - // CHECK-LABEL: define dso_local <8 x i8> 
@test_vqtbl1_mf8( // CHECK-SAME: <16 x i8> [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] diff --git a/clang/test/CodeGen/AArch64/neon-perm.c b/clang/test/CodeGen/AArch64/neon-perm.c index 79cf97f10ae40..df8b526e47a1a 100644 --- a/clang/test/CodeGen/AArch64/neon-perm.c +++ b/clang/test/CodeGen/AArch64/neon-perm.c @@ -6,428 +6,8 @@ #include -// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_s8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -int8x8_t test_vtrn1_s8(int8x8_t a, int8x8_t b) { - return vtrn1_s8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_s8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -int8x16_t test_vtrn1q_s8(int8x16_t a, int8x16_t b) { - return vtrn1q_s8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn1_s16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -int16x4_t test_vtrn1_s16(int16x4_t a, int16x4_t b) { - return vtrn1_s16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn1q_s16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -int16x8_t test_vtrn1q_s16(int16x8_t a, int16x8_t b) { - return vtrn1q_s16(a, b); -} - -// 
CHECK-LABEL: define dso_local <2 x i32> @test_vtrn1_s32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] -// -int32x2_t test_vtrn1_s32(int32x2_t a, int32x2_t b) { - return vtrn1_s32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn1q_s32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] -// -int32x4_t test_vtrn1q_s32(int32x4_t a, int32x4_t b) { - return vtrn1q_s32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_s64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -int64x2_t test_vtrn1q_s64(int64x2_t a, int64x2_t b) { - return vtrn1q_s64(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_u8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -uint8x8_t test_vtrn1_u8(uint8x8_t a, uint8x8_t b) { - return vtrn1_u8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_u8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -uint8x16_t test_vtrn1q_u8(uint8x16_t a, uint8x16_t b) { - return 
vtrn1q_u8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn1_u16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -uint16x4_t test_vtrn1_u16(uint16x4_t a, uint16x4_t b) { - return vtrn1_u16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn1q_u16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -uint16x8_t test_vtrn1q_u16(uint16x8_t a, uint16x8_t b) { - return vtrn1q_u16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn1_u32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] -// -uint32x2_t test_vtrn1_u32(uint32x2_t a, uint32x2_t b) { - return vtrn1_u32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn1q_u32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] -// -uint32x4_t test_vtrn1q_u32(uint32x4_t a, uint32x4_t b) { - return vtrn1q_u32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_u64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -uint64x2_t 
test_vtrn1q_u64(uint64x2_t a, uint64x2_t b) { - return vtrn1q_u64(a, b); -} - -// CHECK-LABEL: define dso_local <2 x float> @test_vtrn1_f32( -// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]] -// -float32x2_t test_vtrn1_f32(float32x2_t a, float32x2_t b) { - return vtrn1_f32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x float> @test_vtrn1q_f32( -// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] -// -float32x4_t test_vtrn1q_f32(float32x4_t a, float32x4_t b) { - return vtrn1q_f32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x double> @test_vtrn1q_f64( -// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]] -// -float64x2_t test_vtrn1q_f64(float64x2_t a, float64x2_t b) { - return vtrn1q_f64(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_p8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -poly8x8_t test_vtrn1_p8(poly8x8_t a, poly8x8_t b) { - return vtrn1_p8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_p8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], 
<16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -poly8x16_t test_vtrn1q_p8(poly8x16_t a, poly8x16_t b) { - return vtrn1q_p8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn1_p16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -poly16x4_t test_vtrn1_p16(poly16x4_t a, poly16x4_t b) { - return vtrn1_p16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn1q_p16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -poly16x8_t test_vtrn1q_p16(poly16x8_t a, poly16x8_t b) { - return vtrn1q_p16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_s8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -int8x8_t test_vtrn2_s8(int8x8_t a, int8x8_t b) { - return vtrn2_s8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_s8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -int8x16_t test_vtrn2q_s8(int8x16_t a, int8x16_t b) { - return vtrn2q_s8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn2_s16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = 
shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -int16x4_t test_vtrn2_s16(int16x4_t a, int16x4_t b) { - return vtrn2_s16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn2q_s16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -int16x8_t test_vtrn2q_s16(int16x8_t a, int16x8_t b) { - return vtrn2q_s16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn2_s32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] -// -int32x2_t test_vtrn2_s32(int32x2_t a, int32x2_t b) { - return vtrn2_s32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn2q_s32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] -// -int32x4_t test_vtrn2q_s32(int32x4_t a, int32x4_t b) { - return vtrn2q_s32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn2q_s64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -int64x2_t test_vtrn2q_s64(int64x2_t a, int64x2_t b) { - return vtrn2q_s64(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_u8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// 
CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -uint8x8_t test_vtrn2_u8(uint8x8_t a, uint8x8_t b) { - return vtrn2_u8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_u8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -uint8x16_t test_vtrn2q_u8(uint8x16_t a, uint8x16_t b) { - return vtrn2q_u8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn2_u16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -uint16x4_t test_vtrn2_u16(uint16x4_t a, uint16x4_t b) { - return vtrn2_u16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn2q_u16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -uint16x8_t test_vtrn2q_u16(uint16x8_t a, uint16x8_t b) { - return vtrn2q_u16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn2_u32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] -// -uint32x2_t test_vtrn2_u32(uint32x2_t a, uint32x2_t b) { - return vtrn2_u32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn2q_u32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) 
#[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] -// -uint32x4_t test_vtrn2q_u32(uint32x4_t a, uint32x4_t b) { - return vtrn2q_u32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn2q_u64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -uint64x2_t test_vtrn2q_u64(uint64x2_t a, uint64x2_t b) { - return vtrn2q_u64(a, b); -} - -// CHECK-LABEL: define dso_local <2 x float> @test_vtrn2_f32( -// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]] -// -float32x2_t test_vtrn2_f32(float32x2_t a, float32x2_t b) { - return vtrn2_f32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x float> @test_vtrn2q_f32( -// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] -// -float32x4_t test_vtrn2q_f32(float32x4_t a, float32x4_t b) { - return vtrn2q_f32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x double> @test_vtrn2q_f64( -// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]] -// -float64x2_t test_vtrn2q_f64(float64x2_t a, float64x2_t b) { - return vtrn2q_f64(a, b); -} - -// CHECK-LABEL: define 
dso_local <8 x i8> @test_vtrn2_p8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -poly8x8_t test_vtrn2_p8(poly8x8_t a, poly8x8_t b) { - return vtrn2_p8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_p8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -poly8x16_t test_vtrn2q_p8(poly8x16_t a, poly8x16_t b) { - return vtrn2q_p8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn2_p16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -poly16x4_t test_vtrn2_p16(poly16x4_t a, poly16x4_t b) { - return vtrn2_p16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn2q_p16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) { - return vtrn2q_p16(a, b); -} - // CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vtrn_s8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> // CHECK-NEXT: [[VTRN1_I:%.*]] = 
shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> diff --git a/clang/test/CodeGen/AArch64/neon/perm.c b/clang/test/CodeGen/AArch64/neon/perm.c index c90eb8290db55..419769ae3f0fa 100644 --- a/clang/test/CodeGen/AArch64/neon/perm.c +++ b/clang/test/CodeGen/AArch64/neon/perm.c @@ -1830,3 +1830,536 @@ uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) { // LLVM: ret %struct.uint16x8x2_t return vuzpq_u16(a, b); } + +//===------------------------------------------------------===// +// 2.1.9.12. Transpose elements +// https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#transpose-elements +//===------------------------------------------------------===// + +// LLVM-LABEL: @test_vtrn1_s8( +// CIR-LABEL: @vtrn1_s8( +int8x8_t test_vtrn1_s8(int8x8_t a, int8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s8i>) [#cir.int<0> : !s64i, #cir.int<8> : !s64i, #cir.int<2> : !s64i, #cir.int<10> : !s64i, #cir.int<4> : !s64i, #cir.int<12> : !s64i, #cir.int<6> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !s8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn1_s8(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_s8( +// CIR-LABEL: @vtrn1q_s8( +int8x16_t test_vtrn1q_s8(int8x16_t a, int8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !s8i>) [#cir.int<0> : !s64i, #cir.int<16> : !s64i, #cir.int<2> : !s64i, #cir.int<18> : !s64i, #cir.int<4> : !s64i, #cir.int<20> : !s64i, #cir.int<6> : !s64i, #cir.int<22> : !s64i, #cir.int<8> : !s64i, #cir.int<24> : !s64i, #cir.int<10> : !s64i, #cir.int<26> : !s64i, #cir.int<12> : !s64i, #cir.int<28> : !s64i, #cir.int<14> : !s64i, #cir.int<30> : !s64i] : !cir.vector<16 x !s8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x 
i8> [[SHUFFLE]] + return vtrn1q_s8(a, b); +} + +// LLVM-LABEL: @test_vtrn1_s16( +// CIR-LABEL: @vtrn1_s16( +int16x4_t test_vtrn1_s16(int16x4_t a, int16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s16i>) [#cir.int<0> : !s64i, #cir.int<4> : !s64i, #cir.int<2> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !s16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[SHUFFLE]] + return vtrn1_s16(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_s16( +// CIR-LABEL: @vtrn1q_s16( +int16x8_t test_vtrn1q_s16(int16x8_t a, int16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s16i>) [#cir.int<0> : !s64i, #cir.int<8> : !s64i, #cir.int<2> : !s64i, #cir.int<10> : !s64i, #cir.int<4> : !s64i, #cir.int<12> : !s64i, #cir.int<6> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !s16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[SHUFFLE]] + return vtrn1q_s16(a, b); +} + +// LLVM-LABEL: @test_vtrn1_s32( +// CIR-LABEL: @vtrn1_s32( +int32x2_t test_vtrn1_s32(int32x2_t a, int32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !s32i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !s32i> + +// LLVM-SAME: <2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: ret <2 x i32> [[SHUFFLE]] + return vtrn1_s32(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_s32( +// CIR-LABEL: @vtrn1q_s32( +int32x4_t test_vtrn1q_s32(int32x4_t a, int32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<0> : !s64i, #cir.int<4> : !s64i, #cir.int<2> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !s32i> + +// LLVM-SAME: <4 x i32>{{.*}}[[A:%.*]], <4 x 
i32>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: ret <4 x i32> [[SHUFFLE]] + return vtrn1q_s32(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_s64( +// CIR-LABEL: @vtrn1q_s64( +int64x2_t test_vtrn1q_s64(int64x2_t a, int64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !s64i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !s64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[SHUFFLE]] + return vtrn1q_s64(a, b); +} + +// LLVM-LABEL: @test_vtrn1_u8( +// CIR-LABEL: @vtrn1_u8( +uint8x8_t test_vtrn1_u8(uint8x8_t a, uint8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<0> : !s64i, #cir.int<8> : !s64i, #cir.int<2> : !s64i, #cir.int<10> : !s64i, #cir.int<4> : !s64i, #cir.int<12> : !s64i, #cir.int<6> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn1_u8(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_u8( +// CIR-LABEL: @vtrn1q_u8( +uint8x16_t test_vtrn1q_u8(uint8x16_t a, uint8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<0> : !s64i, #cir.int<16> : !s64i, #cir.int<2> : !s64i, #cir.int<18> : !s64i, #cir.int<4> : !s64i, #cir.int<20> : !s64i, #cir.int<6> : !s64i, #cir.int<22> : !s64i, #cir.int<8> : !s64i, #cir.int<24> : !s64i, #cir.int<10> : !s64i, #cir.int<26> : !s64i, #cir.int<12> : !s64i, #cir.int<28> : !s64i, #cir.int<14> : !s64i, #cir.int<30> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[SHUFFLE]] + return 
vtrn1q_u8(a, b); +} + +// LLVM-LABEL: @test_vtrn1_u16( +// CIR-LABEL: @vtrn1_u16( +uint16x4_t test_vtrn1_u16(uint16x4_t a, uint16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u16i>) [#cir.int<0> : !s64i, #cir.int<4> : !s64i, #cir.int<2> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !u16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[SHUFFLE]] + return vtrn1_u16(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_u16( +// CIR-LABEL: @vtrn1q_u16( +uint16x8_t test_vtrn1q_u16(uint16x8_t a, uint16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u16i>) [#cir.int<0> : !s64i, #cir.int<8> : !s64i, #cir.int<2> : !s64i, #cir.int<10> : !s64i, #cir.int<4> : !s64i, #cir.int<12> : !s64i, #cir.int<6> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !u16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[SHUFFLE]] + return vtrn1q_u16(a, b); +} + +// LLVM-LABEL: @test_vtrn1_u32( +// CIR-LABEL: @vtrn1_u32( +uint32x2_t test_vtrn1_u32(uint32x2_t a, uint32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u32i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !u32i> + +// LLVM-SAME: <2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: ret <2 x i32> [[SHUFFLE]] + return vtrn1_u32(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_u32( +// CIR-LABEL: @vtrn1q_u32( +uint32x4_t test_vtrn1q_u32(uint32x4_t a, uint32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u32i>) [#cir.int<0> : !s64i, #cir.int<4> : !s64i, #cir.int<2> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !u32i> + +// LLVM-SAME: <4 x i32>{{.*}}[[A:%.*]], <4 x i32>{{.*}}[[B:%.*]]) +// LLVM: 
[[SHUFFLE:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: ret <4 x i32> [[SHUFFLE]] + return vtrn1q_u32(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_u64( +// CIR-LABEL: @vtrn1q_u64( +uint64x2_t test_vtrn1q_u64(uint64x2_t a, uint64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u64i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !u64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[SHUFFLE]] + return vtrn1q_u64(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_p64( +// CIR-LABEL: @vtrn1q_p64( +poly64x2_t test_vtrn1q_p64(poly64x2_t a, poly64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u64i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !u64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[SHUFFLE]] + return vtrn1q_p64(a, b); +} + +// LLVM-LABEL: @test_vtrn1_f32( +// CIR-LABEL: @vtrn1_f32( +float32x2_t test_vtrn1_f32(float32x2_t a, float32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.float>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !cir.float> + +// LLVM-SAME: <2 x float>{{.*}}[[A:%.*]], <2 x float>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// LLVM: ret <2 x float> [[SHUFFLE]] + return vtrn1_f32(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_f32( +// CIR-LABEL: @vtrn1q_f32( +float32x4_t test_vtrn1q_f32(float32x4_t a, float32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s64i, #cir.int<4> : !s64i, #cir.int<2> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !cir.float> + +// LLVM-SAME: <4 x float>{{.*}}[[A:%.*]], <4 x float>{{.*}}[[B:%.*]]) +// LLVM: 
[[SHUFFLE:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// LLVM: ret <4 x float> [[SHUFFLE]] + return vtrn1q_f32(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_f64( +// CIR-LABEL: @vtrn1q_f64( +float64x2_t test_vtrn1q_f64(float64x2_t a, float64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !cir.double> + +// LLVM-SAME: <2 x double>{{.*}}[[A:%.*]], <2 x double>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> +// LLVM: ret <2 x double> [[SHUFFLE]] + return vtrn1q_f64(a, b); +} + +// LLVM-LABEL: @test_vtrn1_p8( +// CIR-LABEL: @vtrn1_p8( +poly8x8_t test_vtrn1_p8(poly8x8_t a, poly8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<0> : !s64i, #cir.int<8> : !s64i, #cir.int<2> : !s64i, #cir.int<10> : !s64i, #cir.int<4> : !s64i, #cir.int<12> : !s64i, #cir.int<6> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn1_p8(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_p8( +// CIR-LABEL: @vtrn1q_p8( +poly8x16_t test_vtrn1q_p8(poly8x16_t a, poly8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<0> : !s64i, #cir.int<16> : !s64i, #cir.int<2> : !s64i, #cir.int<18> : !s64i, #cir.int<4> : !s64i, #cir.int<20> : !s64i, #cir.int<6> : !s64i, #cir.int<22> : !s64i, #cir.int<8> : !s64i, #cir.int<24> : !s64i, #cir.int<10> : !s64i, #cir.int<26> : !s64i, #cir.int<12> : !s64i, #cir.int<28> : !s64i, #cir.int<14> : !s64i, #cir.int<30> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[SHUFFLE]] 
+ return vtrn1q_p8(a, b); +} + +// LLVM-LABEL: @test_vtrn1_p16( +// CIR-LABEL: @vtrn1_p16( +poly16x4_t test_vtrn1_p16(poly16x4_t a, poly16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u16i>) [#cir.int<0> : !s64i, #cir.int<4> : !s64i, #cir.int<2> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !u16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[SHUFFLE]] + return vtrn1_p16(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_p16( +// CIR-LABEL: @vtrn1q_p16( +poly16x8_t test_vtrn1q_p16(poly16x8_t a, poly16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u16i>) [#cir.int<0> : !s64i, #cir.int<8> : !s64i, #cir.int<2> : !s64i, #cir.int<10> : !s64i, #cir.int<4> : !s64i, #cir.int<12> : !s64i, #cir.int<6> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !u16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[SHUFFLE]] + return vtrn1q_p16(a, b); +} + +// LLVM-LABEL: @test_vtrn1_mf8( +// CIR-LABEL: @vtrn1_mf8( +mfloat8x8_t test_vtrn1_mf8(mfloat8x8_t a, mfloat8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<0> : !s64i, #cir.int<8> : !s64i, #cir.int<2> : !s64i, #cir.int<10> : !s64i, #cir.int<4> : !s64i, #cir.int<12> : !s64i, #cir.int<6> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn1_mf8(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_mf8( +// CIR-LABEL: @vtrn1q_mf8( +mfloat8x16_t test_vtrn1q_mf8(mfloat8x16_t a, mfloat8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<0> : !s64i, #cir.int<16> : !s64i, 
#cir.int<2> : !s64i, #cir.int<18> : !s64i, #cir.int<4> : !s64i, #cir.int<20> : !s64i, #cir.int<6> : !s64i, #cir.int<22> : !s64i, #cir.int<8> : !s64i, #cir.int<24> : !s64i, #cir.int<10> : !s64i, #cir.int<26> : !s64i, #cir.int<12> : !s64i, #cir.int<28> : !s64i, #cir.int<14> : !s64i, #cir.int<30> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[SHUFFLE]] + return vtrn1q_mf8(a, b); +} + +// LLVM-LABEL: @test_vtrn2_s8( +// CIR-LABEL: @vtrn2_s8( +int8x8_t test_vtrn2_s8(int8x8_t a, int8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s8i>) [#cir.int<1> : !s64i, #cir.int<9> : !s64i, #cir.int<3> : !s64i, #cir.int<11> : !s64i, #cir.int<5> : !s64i, #cir.int<13> : !s64i, #cir.int<7> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !s8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn2_s8(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_s8( +// CIR-LABEL: @vtrn2q_s8( +int8x16_t test_vtrn2q_s8(int8x16_t a, int8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !s8i>) [#cir.int<1> : !s64i, #cir.int<17> : !s64i, #cir.int<3> : !s64i, #cir.int<19> : !s64i, #cir.int<5> : !s64i, #cir.int<21> : !s64i, #cir.int<7> : !s64i, #cir.int<23> : !s64i, #cir.int<9> : !s64i, #cir.int<25> : !s64i, #cir.int<11> : !s64i, #cir.int<27> : !s64i, #cir.int<13> : !s64i, #cir.int<29> : !s64i, #cir.int<15> : !s64i, #cir.int<31> : !s64i] : !cir.vector<16 x !s8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[SHUFFLE]] + return vtrn2q_s8(a, b); +} + +// LLVM-LABEL: @test_vtrn2_s16( +// CIR-LABEL: @vtrn2_s16( +int16x4_t 
test_vtrn2_s16(int16x4_t a, int16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s16i>) [#cir.int<1> : !s64i, #cir.int<5> : !s64i, #cir.int<3> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !s16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[SHUFFLE]] + return vtrn2_s16(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_s16( +// CIR-LABEL: @vtrn2q_s16( +int16x8_t test_vtrn2q_s16(int16x8_t a, int16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s16i>) [#cir.int<1> : !s64i, #cir.int<9> : !s64i, #cir.int<3> : !s64i, #cir.int<11> : !s64i, #cir.int<5> : !s64i, #cir.int<13> : !s64i, #cir.int<7> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !s16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[SHUFFLE]] + return vtrn2q_s16(a, b); +} + +// LLVM-LABEL: @test_vtrn2_s32( +// CIR-LABEL: @vtrn2_s32( +int32x2_t test_vtrn2_s32(int32x2_t a, int32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !s32i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !s32i> + +// LLVM-SAME: <2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: ret <2 x i32> [[SHUFFLE]] + return vtrn2_s32(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_s32( +// CIR-LABEL: @vtrn2q_s32( +int32x4_t test_vtrn2q_s32(int32x4_t a, int32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<1> : !s64i, #cir.int<5> : !s64i, #cir.int<3> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !s32i> + +// LLVM-SAME: <4 x i32>{{.*}}[[A:%.*]], <4 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: ret <4 x i32> 
[[SHUFFLE]] + return vtrn2q_s32(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_s64( +// CIR-LABEL: @vtrn2q_s64( +int64x2_t test_vtrn2q_s64(int64x2_t a, int64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !s64i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !s64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[SHUFFLE]] + return vtrn2q_s64(a, b); +} + +// LLVM-LABEL: @test_vtrn2_u8( +// CIR-LABEL: @vtrn2_u8( +uint8x8_t test_vtrn2_u8(uint8x8_t a, uint8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<1> : !s64i, #cir.int<9> : !s64i, #cir.int<3> : !s64i, #cir.int<11> : !s64i, #cir.int<5> : !s64i, #cir.int<13> : !s64i, #cir.int<7> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn2_u8(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_u8( +// CIR-LABEL: @vtrn2q_u8( +uint8x16_t test_vtrn2q_u8(uint8x16_t a, uint8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<1> : !s64i, #cir.int<17> : !s64i, #cir.int<3> : !s64i, #cir.int<19> : !s64i, #cir.int<5> : !s64i, #cir.int<21> : !s64i, #cir.int<7> : !s64i, #cir.int<23> : !s64i, #cir.int<9> : !s64i, #cir.int<25> : !s64i, #cir.int<11> : !s64i, #cir.int<27> : !s64i, #cir.int<13> : !s64i, #cir.int<29> : !s64i, #cir.int<15> : !s64i, #cir.int<31> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[SHUFFLE]] + return vtrn2q_u8(a, b); +} + +// LLVM-LABEL: @test_vtrn2_u16( +// CIR-LABEL: @vtrn2_u16( +uint16x4_t test_vtrn2_u16(uint16x4_t a, 
uint16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u16i>) [#cir.int<1> : !s64i, #cir.int<5> : !s64i, #cir.int<3> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !u16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[SHUFFLE]] + return vtrn2_u16(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_u16( +// CIR-LABEL: @vtrn2q_u16( +uint16x8_t test_vtrn2q_u16(uint16x8_t a, uint16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u16i>) [#cir.int<1> : !s64i, #cir.int<9> : !s64i, #cir.int<3> : !s64i, #cir.int<11> : !s64i, #cir.int<5> : !s64i, #cir.int<13> : !s64i, #cir.int<7> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !u16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[SHUFFLE]] + return vtrn2q_u16(a, b); +} + +// LLVM-LABEL: @test_vtrn2_u32( +// CIR-LABEL: @vtrn2_u32( +uint32x2_t test_vtrn2_u32(uint32x2_t a, uint32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u32i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !u32i> + +// LLVM-SAME: <2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: ret <2 x i32> [[SHUFFLE]] + return vtrn2_u32(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_u32( +// CIR-LABEL: @vtrn2q_u32( +uint32x4_t test_vtrn2q_u32(uint32x4_t a, uint32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u32i>) [#cir.int<1> : !s64i, #cir.int<5> : !s64i, #cir.int<3> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !u32i> + +// LLVM-SAME: <4 x i32>{{.*}}[[A:%.*]], <4 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: ret <4 x i32> [[SHUFFLE]] + return 
vtrn2q_u32(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_u64( +// CIR-LABEL: @vtrn2q_u64( +uint64x2_t test_vtrn2q_u64(uint64x2_t a, uint64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u64i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !u64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[SHUFFLE]] + return vtrn2q_u64(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_p64( +// CIR-LABEL: @vtrn2q_p64( +poly64x2_t test_vtrn2q_p64(poly64x2_t a, poly64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u64i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !u64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[SHUFFLE]] + return vtrn2q_p64(a, b); +} + +// LLVM-LABEL: @test_vtrn2_f32( +// CIR-LABEL: @vtrn2_f32( +float32x2_t test_vtrn2_f32(float32x2_t a, float32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.float>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !cir.float> + +// LLVM-SAME: <2 x float>{{.*}}[[A:%.*]], <2 x float>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// LLVM: ret <2 x float> [[SHUFFLE]] + return vtrn2_f32(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_f32( +// CIR-LABEL: @vtrn2q_f32( +float32x4_t test_vtrn2q_f32(float32x4_t a, float32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<1> : !s64i, #cir.int<5> : !s64i, #cir.int<3> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !cir.float> + +// LLVM-SAME: <4 x float>{{.*}}[[A:%.*]], <4 x float>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// LLVM: ret <4 x float> [[SHUFFLE]] + return 
vtrn2q_f32(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_f64( +// CIR-LABEL: @vtrn2q_f64( +float64x2_t test_vtrn2q_f64(float64x2_t a, float64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !cir.double> + +// LLVM-SAME: <2 x double>{{.*}}[[A:%.*]], <2 x double>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> +// LLVM: ret <2 x double> [[SHUFFLE]] + return vtrn2q_f64(a, b); +} + +// LLVM-LABEL: @test_vtrn2_p8( +// CIR-LABEL: @vtrn2_p8( +poly8x8_t test_vtrn2_p8(poly8x8_t a, poly8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<1> : !s64i, #cir.int<9> : !s64i, #cir.int<3> : !s64i, #cir.int<11> : !s64i, #cir.int<5> : !s64i, #cir.int<13> : !s64i, #cir.int<7> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn2_p8(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_p8( +// CIR-LABEL: @vtrn2q_p8( +poly8x16_t test_vtrn2q_p8(poly8x16_t a, poly8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<1> : !s64i, #cir.int<17> : !s64i, #cir.int<3> : !s64i, #cir.int<19> : !s64i, #cir.int<5> : !s64i, #cir.int<21> : !s64i, #cir.int<7> : !s64i, #cir.int<23> : !s64i, #cir.int<9> : !s64i, #cir.int<25> : !s64i, #cir.int<11> : !s64i, #cir.int<27> : !s64i, #cir.int<13> : !s64i, #cir.int<29> : !s64i, #cir.int<15> : !s64i, #cir.int<31> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[SHUFFLE]] + return vtrn2q_p8(a, b); +} + +// LLVM-LABEL: @test_vtrn2_p16( +// CIR-LABEL: @vtrn2_p16( +poly16x4_t 
test_vtrn2_p16(poly16x4_t a, poly16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u16i>) [#cir.int<1> : !s64i, #cir.int<5> : !s64i, #cir.int<3> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !u16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[SHUFFLE]] + return vtrn2_p16(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_p16( +// CIR-LABEL: @vtrn2q_p16( +poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u16i>) [#cir.int<1> : !s64i, #cir.int<9> : !s64i, #cir.int<3> : !s64i, #cir.int<11> : !s64i, #cir.int<5> : !s64i, #cir.int<13> : !s64i, #cir.int<7> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !u16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[SHUFFLE]] + return vtrn2q_p16(a, b); +} + +// LLVM-LABEL: @test_vtrn2_mf8( +// CIR-LABEL: @vtrn2_mf8( +mfloat8x8_t test_vtrn2_mf8(mfloat8x8_t a, mfloat8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<1> : !s64i, #cir.int<9> : !s64i, #cir.int<3> : !s64i, #cir.int<11> : !s64i, #cir.int<5> : !s64i, #cir.int<13> : !s64i, #cir.int<7> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn2_mf8(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_mf8( +// CIR-LABEL: @vtrn2q_mf8( +mfloat8x16_t test_vtrn2q_mf8(mfloat8x16_t a, mfloat8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<1> : !s64i, #cir.int<17> : !s64i, #cir.int<3> : !s64i, #cir.int<19> : !s64i, #cir.int<5> : !s64i, #cir.int<21> : !s64i, #cir.int<7> : !s64i, 
#cir.int<23> : !s64i, #cir.int<9> : !s64i, #cir.int<25> : !s64i, #cir.int<11> : !s64i, #cir.int<27> : !s64i, #cir.int<13> : !s64i, #cir.int<29> : !s64i, #cir.int<15> : !s64i, #cir.int<31> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[SHUFFLE]] + return vtrn2q_mf8(a, b); +} diff --git a/clang/test/CodeGen/AArch64/poly64.c b/clang/test/CodeGen/AArch64/poly64.c index 1a7eceefa6a58..50617f531e6a1 100644 --- a/clang/test/CodeGen/AArch64/poly64.c +++ b/clang/test/CodeGen/AArch64/poly64.c @@ -515,26 +515,6 @@ poly64x2_t test_vextq_p64(poly64x2_t a, poly64x2_t b) { return vextq_p64(a, b, 1); } -// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_p64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -poly64x2_t test_vtrn1q_p64(poly64x2_t a, poly64x2_t b) { - return vtrn1q_p64(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn2q_p64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -poly64x2_t test_vtrn2q_p64(poly64x2_t a, poly64x2_t b) { - return vtrn2q_u64(a, b); -} - // CHECK-LABEL: define dso_local <1 x i64> @test_vsri_n_p64( // CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] diff --git a/clang/test/CodeGenHLSL/builtins/AllMemoryBarrier.hlsl b/clang/test/CodeGenHLSL/builtins/AllMemoryBarrier.hlsl index 90d51c716c771..0fa798a16b805 100644 --- a/clang/test/CodeGenHLSL/builtins/AllMemoryBarrier.hlsl +++ 
b/clang/test/CodeGenHLSL/builtins/AllMemoryBarrier.hlsl @@ -11,7 +11,7 @@ // CHECK-SPIRV: define hidden spir_func void @ void test_AllMemoryBarrier() { // CHECK-DXIL: call void @llvm.[[TARGET]].all.memory.barrier() -// CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].all.memory.barrier() +// CHECK-SPIRV: call void @llvm.[[TARGET]].all.memory.barrier() AllMemoryBarrier(); } diff --git a/clang/test/CodeGenHLSL/builtins/AllMemoryBarrierWithGroupSync.hlsl b/clang/test/CodeGenHLSL/builtins/AllMemoryBarrierWithGroupSync.hlsl index 6ddb69671e094..b4a3371f7628f 100644 --- a/clang/test/CodeGenHLSL/builtins/AllMemoryBarrierWithGroupSync.hlsl +++ b/clang/test/CodeGenHLSL/builtins/AllMemoryBarrierWithGroupSync.hlsl @@ -11,7 +11,7 @@ // CHECK-SPIRV: define hidden spir_func void @ void test_AllMemoryBarrierWithGroupSync() { // CHECK-DXIL: call void @llvm.[[TARGET]].all.memory.barrier.with.group.sync() -// CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].all.memory.barrier.with.group.sync() +// CHECK-SPIRV: call void @llvm.[[TARGET]].all.memory.barrier.with.group.sync() AllMemoryBarrierWithGroupSync(); } diff --git a/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrier.hlsl b/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrier.hlsl index e2c08f7775c8c..d9613aedc1cc6 100644 --- a/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrier.hlsl +++ b/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrier.hlsl @@ -11,7 +11,7 @@ // CHECK-SPIRV: define hidden spir_func void @ void test_DeviceMemoryBarrier() { // CHECK-DXIL: call void @llvm.[[TARGET]].device.memory.barrier() -// CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].device.memory.barrier() +// CHECK-SPIRV: call void @llvm.[[TARGET]].device.memory.barrier() DeviceMemoryBarrier(); } diff --git a/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrierWithGroupSync.hlsl b/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrierWithGroupSync.hlsl index fa455f5f8338b..bea7d7391aec2 100644 --- 
a/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrierWithGroupSync.hlsl +++ b/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrierWithGroupSync.hlsl @@ -11,7 +11,7 @@ // CHECK-SPIRV: define hidden spir_func void @ void test_DeviceMemoryBarrierWithGroupSync() { // CHECK-DXIL: call void @llvm.[[TARGET]].device.memory.barrier.with.group.sync() -// CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].device.memory.barrier.with.group.sync() +// CHECK-SPIRV: call void @llvm.[[TARGET]].device.memory.barrier.with.group.sync() DeviceMemoryBarrierWithGroupSync(); } diff --git a/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrier.hlsl b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrier.hlsl index b52819973f677..d33baeac940b6 100644 --- a/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrier.hlsl +++ b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrier.hlsl @@ -11,7 +11,7 @@ // CHECK-SPIRV: define hidden spir_func void @ void test_GroupMemoryBarrier() { // CHECK-DXIL: call void @llvm.[[TARGET]].group.memory.barrier() -// CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].group.memory.barrier() +// CHECK-SPIRV: call void @llvm.[[TARGET]].group.memory.barrier() GroupMemoryBarrier(); } diff --git a/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl index e709ed3616f0d..b69f67cb8dfaa 100644 --- a/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl +++ b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl @@ -11,7 +11,7 @@ // CHECK-SPIRV: define hidden spir_func void @ void test_GroupMemoryBarrierWithGroupSync() { // CHECK-DXIL: call void @llvm.[[TARGET]].group.memory.barrier.with.group.sync() -// CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].group.memory.barrier.with.group.sync() +// CHECK-SPIRV: call void @llvm.[[TARGET]].group.memory.barrier.with.group.sync() GroupMemoryBarrierWithGroupSync(); } diff --git a/clang/test/CodeGenHLSL/builtins/QuadReadAcrossX.hlsl 
b/clang/test/CodeGenHLSL/builtins/QuadReadAcrossX.hlsl index 54dd82b9fd485..f6bf05e524964 100644 --- a/clang/test/CodeGenHLSL/builtins/QuadReadAcrossX.hlsl +++ b/clang/test/CodeGenHLSL/builtins/QuadReadAcrossX.hlsl @@ -1,169 +1,169 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,CHECK-NATIVE_HALF -DTARGET=dx -DCC="" +// RUN: --check-prefixes=CHECK,CHECK-NATIVE_HALF -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO_HALF -DTARGET=dx -DCC="" +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO_HALF -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,CHECK-NATIVE_HALF -DTARGET=spv -DCC="spir_func " +// RUN: --check-prefixes=CHECK,CHECK-NATIVE_HALF -DTARGET=spv // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO_HALF -DTARGET=spv -DCC="spir_func " +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO_HALF -DTARGET=spv -// CHECK: %[[RET:.*]] = call [[CC]]i32 @llvm.[[TARGET]].quad.read.across.x.i32(i32 %[[#]]) +// CHECK: %[[RET:.*]] = call i32 @llvm.[[TARGET]].quad.read.across.x.i32(i32 %[[#]]) // CHECK: ret i32 %[[RET]] int test_int(int expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i32> @llvm.[[TARGET]].quad.read.across.x.v2i32(<2 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <2 x i32> @llvm.[[TARGET]].quad.read.across.x.v2i32(<2 x i32> 
%[[#]]) // CHECK: ret <2 x i32> %[[RET]] int2 test_int2(int2 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i32> @llvm.[[TARGET]].quad.read.across.x.v3i32(<3 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i32> @llvm.[[TARGET]].quad.read.across.x.v3i32(<3 x i32> %[[#]]) // CHECK: ret <3 x i32> %[[RET]] int3 test_int3(int3 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<4 x i32> @llvm.[[TARGET]].quad.read.across.x.v4i32(<4 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i32> @llvm.[[TARGET]].quad.read.across.x.v4i32(<4 x i32> %[[#]]) // CHECK: ret <4 x i32> %[[RET]] int4 test_int4(int4 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]i32 @llvm.[[TARGET]].quad.read.across.x.i32(i32 %[[#]]) +// CHECK: %[[RET:.*]] = call i32 @llvm.[[TARGET]].quad.read.across.x.i32(i32 %[[#]]) // CHECK: ret i32 %[[RET]] uint test_uint(uint expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i32> @llvm.[[TARGET]].quad.read.across.x.v2i32(<2 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <2 x i32> @llvm.[[TARGET]].quad.read.across.x.v2i32(<2 x i32> %[[#]]) // CHECK: ret <2 x i32> %[[RET]] uint2 test_uint2(uint2 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i32> @llvm.[[TARGET]].quad.read.across.x.v3i32(<3 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i32> @llvm.[[TARGET]].quad.read.across.x.v3i32(<3 x i32> %[[#]]) // CHECK: ret <3 x i32> %[[RET]] uint3 test_uint3(uint3 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<4 x i32> @llvm.[[TARGET]].quad.read.across.x.v4i32(<4 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i32> @llvm.[[TARGET]].quad.read.across.x.v4i32(<4 x i32> %[[#]]) // CHECK: ret <4 x i32> %[[RET]] uint4 test_uint4(uint4 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]i64 @llvm.[[TARGET]].quad.read.across.x.i64(i64 %[[#]]) +// CHECK: %[[RET:.*]] = call i64 
@llvm.[[TARGET]].quad.read.across.x.i64(i64 %[[#]]) // CHECK: ret i64 %[[RET]] int64_t test_int64_t(int64_t expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i64> @llvm.[[TARGET]].quad.read.across.x.v2i64(<2 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <2 x i64> @llvm.[[TARGET]].quad.read.across.x.v2i64(<2 x i64> %[[#]]) // CHECK: ret <2 x i64> %[[RET]] int64_t2 test_int64_t2(int64_t2 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i64> @llvm.[[TARGET]].quad.read.across.x.v3i64(<3 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i64> @llvm.[[TARGET]].quad.read.across.x.v3i64(<3 x i64> %[[#]]) // CHECK: ret <3 x i64> %[[RET]] int64_t3 test_int64_t3(int64_t3 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<4 x i64> @llvm.[[TARGET]].quad.read.across.x.v4i64(<4 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i64> @llvm.[[TARGET]].quad.read.across.x.v4i64(<4 x i64> %[[#]]) // CHECK: ret <4 x i64> %[[RET]] int64_t4 test_int64_t4(int64_t4 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]i64 @llvm.[[TARGET]].quad.read.across.x.i64(i64 %[[#]]) +// CHECK: %[[RET:.*]] = call i64 @llvm.[[TARGET]].quad.read.across.x.i64(i64 %[[#]]) // CHECK: ret i64 %[[RET]] uint64_t test_uint64_t(uint64_t expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i64> @llvm.[[TARGET]].quad.read.across.x.v2i64(<2 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <2 x i64> @llvm.[[TARGET]].quad.read.across.x.v2i64(<2 x i64> %[[#]]) // CHECK: ret <2 x i64> %[[RET]] uint64_t2 test_uint64_t2(uint64_t2 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i64> @llvm.[[TARGET]].quad.read.across.x.v3i64(<3 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i64> @llvm.[[TARGET]].quad.read.across.x.v3i64(<3 x i64> %[[#]]) // CHECK: ret <3 x i64> %[[RET]] uint64_t3 test_uint64_t3(uint64_t3 expr) { return QuadReadAcrossX(expr); } -// CHECK: 
%[[RET:.*]] = call [[CC]]<4 x i64> @llvm.[[TARGET]].quad.read.across.x.v4i64(<4 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i64> @llvm.[[TARGET]].quad.read.across.x.v4i64(<4 x i64> %[[#]]) // CHECK: ret <4 x i64> %[[RET]] uint64_t4 test_uint64_t4(uint64_t4 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]float @llvm.[[TARGET]].quad.read.across.x.f32(float %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].quad.read.across.x.f32(float %[[#]]) // CHECK: ret float %[[RET]] float test_float(float expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x float> @llvm.[[TARGET]].quad.read.across.x.v2f32(<2 x float> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.[[TARGET]].quad.read.across.x.v2f32(<2 x float> %[[#]]) // CHECK: ret <2 x float> %[[RET]] float2 test_float2(float2 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x float> @llvm.[[TARGET]].quad.read.across.x.v3f32(<3 x float> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.[[TARGET]].quad.read.across.x.v3f32(<3 x float> %[[#]]) // CHECK: ret <3 x float> %[[RET]] float3 test_float3(float3 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x float> @llvm.[[TARGET]].quad.read.across.x.v4f32(<4 x float> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.[[TARGET]].quad.read.across.x.v4f32(<4 x float> %[[#]]) // CHECK: ret <4 x float> %[[RET]] float4 test_float4(float4 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]double @llvm.[[TARGET]].quad.read.across.x.f64(double %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn double 
@llvm.[[TARGET]].quad.read.across.x.f64(double %[[#]]) // CHECK: ret double %[[RET]] double test_double(double expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x double> @llvm.[[TARGET]].quad.read.across.x.v2f64(<2 x double> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x double> @llvm.[[TARGET]].quad.read.across.x.v2f64(<2 x double> %[[#]]) // CHECK: ret <2 x double> %[[RET]] double2 test_double2(double2 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x double> @llvm.[[TARGET]].quad.read.across.x.v3f64(<3 x double> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x double> @llvm.[[TARGET]].quad.read.across.x.v3f64(<3 x double> %[[#]]) // CHECK: ret <3 x double> %[[RET]] double3 test_double3(double3 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x double> @llvm.[[TARGET]].quad.read.across.x.v4f64(<4 x double> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <4 x double> @llvm.[[TARGET]].quad.read.across.x.v4f64(<4 x double> %[[#]]) // CHECK: ret <4 x double> %[[RET]] double4 test_double4(double4 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]half @llvm.[[TARGET]].quad.read.across.x.f16(half %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].quad.read.across.x.f16(half %[[#]]) // CHECK-NATIVE_HALF: ret half %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]float @llvm.[[TARGET]].quad.read.across.x.f32(float %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].quad.read.across.x.f32(float %[[#]]) // CHECK-NO_HALF: ret float %[[RET]] half test_half(half expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: 
%[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x half> @llvm.[[TARGET]].quad.read.across.x.v2f16(<2 x half> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.[[TARGET]].quad.read.across.x.v2f16(<2 x half> %[[#]]) // CHECK-NATIVE_HALF: ret <2 x half> %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x float> @llvm.[[TARGET]].quad.read.across.x.v2f32(<2 x float> %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.[[TARGET]].quad.read.across.x.v2f32(<2 x float> %[[#]]) // CHECK-NO_HALF: ret <2 x float> %[[RET]] half2 test_half2(half2 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x half> @llvm.[[TARGET]].quad.read.across.x.v3f16(<3 x half> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.[[TARGET]].quad.read.across.x.v3f16(<3 x half> %[[#]]) // CHECK-NATIVE_HALF: ret <3 x half> %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x float> @llvm.[[TARGET]].quad.read.across.x.v3f32(<3 x float> %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.[[TARGET]].quad.read.across.x.v3f32(<3 x float> %[[#]]) // CHECK-NO_HALF: ret <3 x float> %[[RET]] half3 test_half3(half3 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x half> @llvm.[[TARGET]].quad.read.across.x.v4f16(<4 x half> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.[[TARGET]].quad.read.across.x.v4f16(<4 x half> %[[#]]) // CHECK-NATIVE_HALF: ret <4 x half> %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x float> @llvm.[[TARGET]].quad.read.across.x.v4f32(<4 x float> %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan 
ninf nsz arcp afn <4 x float> @llvm.[[TARGET]].quad.read.across.x.v4f32(<4 x float> %[[#]]) // CHECK-NO_HALF: ret <4 x float> %[[RET]] half4 test_half4(half4 expr) { return QuadReadAcrossX(expr); } #ifdef __HLSL_ENABLE_16_BIT -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]i16 @llvm.[[TARGET]].quad.read.across.x.i16(i16 %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call i16 @llvm.[[TARGET]].quad.read.across.x.i16(i16 %[[#]]) // CHECK-NATIVE_HALF: ret i16 %[[RET]] int16_t test_int16_t(int16_t expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<2 x i16> @llvm.[[TARGET]].quad.read.across.x.v2i16(<2 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <2 x i16> @llvm.[[TARGET]].quad.read.across.x.v2i16(<2 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <2 x i16> %[[RET]] int16_t2 test_int16_t2(int16_t2 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<3 x i16> @llvm.[[TARGET]].quad.read.across.x.v3i16(<3 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <3 x i16> @llvm.[[TARGET]].quad.read.across.x.v3i16(<3 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <3 x i16> %[[RET]] int16_t3 test_int16_t3(int16_t3 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<4 x i16> @llvm.[[TARGET]].quad.read.across.x.v4i16(<4 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <4 x i16> @llvm.[[TARGET]].quad.read.across.x.v4i16(<4 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <4 x i16> %[[RET]] int16_t4 test_int16_t4(int16_t4 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]i16 @llvm.[[TARGET]].quad.read.across.x.i16(i16 %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call i16 @llvm.[[TARGET]].quad.read.across.x.i16(i16 %[[#]]) // CHECK-NATIVE_HALF: ret i16 %[[RET]] uint16_t test_uint16_t(uint16_t expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<2 x i16> @llvm.[[TARGET]].quad.read.across.x.v2i16(<2 x i16> 
%[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <2 x i16> @llvm.[[TARGET]].quad.read.across.x.v2i16(<2 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <2 x i16> %[[RET]] uint16_t2 test_uint16_t2(uint16_t2 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<3 x i16> @llvm.[[TARGET]].quad.read.across.x.v3i16(<3 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <3 x i16> @llvm.[[TARGET]].quad.read.across.x.v3i16(<3 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <3 x i16> %[[RET]] uint16_t3 test_uint16_t3(uint16_t3 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<4 x i16> @llvm.[[TARGET]].quad.read.across.x.v4i16(<4 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <4 x i16> @llvm.[[TARGET]].quad.read.across.x.v4i16(<4 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <4 x i16> %[[RET]] uint16_t4 test_uint16_t4(uint16_t4 expr) { return QuadReadAcrossX(expr); } #endif diff --git a/clang/test/CodeGenHLSL/builtins/QuadReadAcrossY.hlsl b/clang/test/CodeGenHLSL/builtins/QuadReadAcrossY.hlsl index 313c287dc1a7d..9d70545f90a28 100644 --- a/clang/test/CodeGenHLSL/builtins/QuadReadAcrossY.hlsl +++ b/clang/test/CodeGenHLSL/builtins/QuadReadAcrossY.hlsl @@ -15,157 +15,157 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV,CHECK-NO_HALF // Capture the expected interchange format so not every check needs to be duplicated -// CHECK-DXIL: %[[RET:.*]] = call [[CC:]]i32 @llvm.[[ICF:dx]].quad.read.across.y.i32(i32 %[[#]]) -// CHECK-SPIRV: %[[RET:.*]] = call [[CC:spir_func ]]i32 @llvm.[[ICF:spv]].quad.read.across.y.i32(i32 %[[#]]) +// CHECK-DXIL: %[[RET:.*]] = call i32 @llvm.[[ICF:dx]].quad.read.across.y.i32(i32 %[[#]]) +// CHECK-SPIRV: %[[RET:.*]] = call i32 @llvm.[[ICF:spv]].quad.read.across.y.i32(i32 %[[#]]) // CHECK: ret i32 %[[RET]] int test_int(int expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i32> @llvm.[[ICF]].quad.read.across.y.v2i32(<2 x i32> %[[#]]) +// 
CHECK: %[[RET:.*]] = call <2 x i32> @llvm.[[ICF]].quad.read.across.y.v2i32(<2 x i32> %[[#]]) // CHECK: ret <2 x i32> %[[RET]] int2 test_int2(int2 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i32> @llvm.[[ICF]].quad.read.across.y.v3i32(<3 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i32> @llvm.[[ICF]].quad.read.across.y.v3i32(<3 x i32> %[[#]]) // CHECK: ret <3 x i32> %[[RET]] int3 test_int3(int3 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<4 x i32> @llvm.[[ICF]].quad.read.across.y.v4i32(<4 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i32> @llvm.[[ICF]].quad.read.across.y.v4i32(<4 x i32> %[[#]]) // CHECK: ret <4 x i32> %[[RET]] int4 test_int4(int4 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]i32 @llvm.[[ICF]].quad.read.across.y.i32(i32 %[[#]]) +// CHECK: %[[RET:.*]] = call i32 @llvm.[[ICF]].quad.read.across.y.i32(i32 %[[#]]) // CHECK: ret i32 %[[RET]] uint test_uint(uint expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i32> @llvm.[[ICF]].quad.read.across.y.v2i32(<2 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <2 x i32> @llvm.[[ICF]].quad.read.across.y.v2i32(<2 x i32> %[[#]]) // CHECK: ret <2 x i32> %[[RET]] uint2 test_uint2(uint2 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i32> @llvm.[[ICF]].quad.read.across.y.v3i32(<3 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i32> @llvm.[[ICF]].quad.read.across.y.v3i32(<3 x i32> %[[#]]) // CHECK: ret <3 x i32> %[[RET]] uint3 test_uint3(uint3 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<4 x i32> @llvm.[[ICF]].quad.read.across.y.v4i32(<4 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i32> @llvm.[[ICF]].quad.read.across.y.v4i32(<4 x i32> %[[#]]) // CHECK: ret <4 x i32> %[[RET]] uint4 test_uint4(uint4 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]i64 @llvm.[[ICF]].quad.read.across.y.i64(i64 
%[[#]]) +// CHECK: %[[RET:.*]] = call i64 @llvm.[[ICF]].quad.read.across.y.i64(i64 %[[#]]) // CHECK: ret i64 %[[RET]] int64_t test_int64_t(int64_t expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i64> @llvm.[[ICF]].quad.read.across.y.v2i64(<2 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <2 x i64> @llvm.[[ICF]].quad.read.across.y.v2i64(<2 x i64> %[[#]]) // CHECK: ret <2 x i64> %[[RET]] int64_t2 test_int64_t2(int64_t2 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i64> @llvm.[[ICF]].quad.read.across.y.v3i64(<3 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i64> @llvm.[[ICF]].quad.read.across.y.v3i64(<3 x i64> %[[#]]) // CHECK: ret <3 x i64> %[[RET]] int64_t3 test_int64_t3(int64_t3 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<4 x i64> @llvm.[[ICF]].quad.read.across.y.v4i64(<4 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i64> @llvm.[[ICF]].quad.read.across.y.v4i64(<4 x i64> %[[#]]) // CHECK: ret <4 x i64> %[[RET]] int64_t4 test_int64_t4(int64_t4 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]i64 @llvm.[[ICF]].quad.read.across.y.i64(i64 %[[#]]) +// CHECK: %[[RET:.*]] = call i64 @llvm.[[ICF]].quad.read.across.y.i64(i64 %[[#]]) // CHECK: ret i64 %[[RET]] uint64_t test_uint64_t(uint64_t expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i64> @llvm.[[ICF]].quad.read.across.y.v2i64(<2 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <2 x i64> @llvm.[[ICF]].quad.read.across.y.v2i64(<2 x i64> %[[#]]) // CHECK: ret <2 x i64> %[[RET]] uint64_t2 test_uint64_t2(uint64_t2 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i64> @llvm.[[ICF]].quad.read.across.y.v3i64(<3 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i64> @llvm.[[ICF]].quad.read.across.y.v3i64(<3 x i64> %[[#]]) // CHECK: ret <3 x i64> %[[RET]] uint64_t3 test_uint64_t3(uint64_t3 expr) { return QuadReadAcrossY(expr); } -// CHECK: 
%[[RET:.*]] = call [[CC]]<4 x i64> @llvm.[[ICF]].quad.read.across.y.v4i64(<4 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i64> @llvm.[[ICF]].quad.read.across.y.v4i64(<4 x i64> %[[#]]) // CHECK: ret <4 x i64> %[[RET]] uint64_t4 test_uint64_t4(uint64_t4 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]float @llvm.[[ICF]].quad.read.across.y.f32(float %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.[[ICF]].quad.read.across.y.f32(float %[[#]]) // CHECK: ret float %[[RET]] float test_float(float expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x float> @llvm.[[ICF]].quad.read.across.y.v2f32(<2 x float> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.[[ICF]].quad.read.across.y.v2f32(<2 x float> %[[#]]) // CHECK: ret <2 x float> %[[RET]] float2 test_float2(float2 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x float> @llvm.[[ICF]].quad.read.across.y.v3f32(<3 x float> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.[[ICF]].quad.read.across.y.v3f32(<3 x float> %[[#]]) // CHECK: ret <3 x float> %[[RET]] float3 test_float3(float3 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x float> @llvm.[[ICF]].quad.read.across.y.v4f32(<4 x float> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.[[ICF]].quad.read.across.y.v4f32(<4 x float> %[[#]]) // CHECK: ret <4 x float> %[[RET]] float4 test_float4(float4 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]double @llvm.[[ICF]].quad.read.across.y.f64(double %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn double @llvm.[[ICF]].quad.read.across.y.f64(double %[[#]]) // CHECK: 
ret double %[[RET]] double test_double(double expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x double> @llvm.[[ICF]].quad.read.across.y.v2f64(<2 x double> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x double> @llvm.[[ICF]].quad.read.across.y.v2f64(<2 x double> %[[#]]) // CHECK: ret <2 x double> %[[RET]] double2 test_double2(double2 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x double> @llvm.[[ICF]].quad.read.across.y.v3f64(<3 x double> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x double> @llvm.[[ICF]].quad.read.across.y.v3f64(<3 x double> %[[#]]) // CHECK: ret <3 x double> %[[RET]] double3 test_double3(double3 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x double> @llvm.[[ICF]].quad.read.across.y.v4f64(<4 x double> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <4 x double> @llvm.[[ICF]].quad.read.across.y.v4f64(<4 x double> %[[#]]) // CHECK: ret <4 x double> %[[RET]] double4 test_double4(double4 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]half @llvm.[[ICF]].quad.read.across.y.f16(half %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.[[ICF]].quad.read.across.y.f16(half %[[#]]) // CHECK-NATIVE_HALF: ret half %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]float @llvm.[[ICF]].quad.read.across.y.f32(float %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.[[ICF]].quad.read.across.y.f32(float %[[#]]) // CHECK-NO_HALF: ret float %[[RET]] half test_half(half expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x half> 
@llvm.[[ICF]].quad.read.across.y.v2f16(<2 x half> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.[[ICF]].quad.read.across.y.v2f16(<2 x half> %[[#]]) // CHECK-NATIVE_HALF: ret <2 x half> %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x float> @llvm.[[ICF]].quad.read.across.y.v2f32(<2 x float> %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.[[ICF]].quad.read.across.y.v2f32(<2 x float> %[[#]]) // CHECK-NO_HALF: ret <2 x float> %[[RET]] half2 test_half2(half2 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x half> @llvm.[[ICF]].quad.read.across.y.v3f16(<3 x half> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.[[ICF]].quad.read.across.y.v3f16(<3 x half> %[[#]]) // CHECK-NATIVE_HALF: ret <3 x half> %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x float> @llvm.[[ICF]].quad.read.across.y.v3f32(<3 x float> %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.[[ICF]].quad.read.across.y.v3f32(<3 x float> %[[#]]) // CHECK-NO_HALF: ret <3 x float> %[[RET]] half3 test_half3(half3 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x half> @llvm.[[ICF]].quad.read.across.y.v4f16(<4 x half> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.[[ICF]].quad.read.across.y.v4f16(<4 x half> %[[#]]) // CHECK-NATIVE_HALF: ret <4 x half> %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x float> @llvm.[[ICF]].quad.read.across.y.v4f32(<4 x float> %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.[[ICF]].quad.read.across.y.v4f32(<4 x float> %[[#]]) // 
CHECK-NO_HALF: ret <4 x float> %[[RET]] half4 test_half4(half4 expr) { return QuadReadAcrossY(expr); } #ifdef __HLSL_ENABLE_16_BIT -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]i16 @llvm.[[ICF]].quad.read.across.y.i16(i16 %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call i16 @llvm.[[ICF]].quad.read.across.y.i16(i16 %[[#]]) // CHECK-NATIVE_HALF: ret i16 %[[RET]] int16_t test_int16_t(int16_t expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<2 x i16> @llvm.[[ICF]].quad.read.across.y.v2i16(<2 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <2 x i16> @llvm.[[ICF]].quad.read.across.y.v2i16(<2 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <2 x i16> %[[RET]] int16_t2 test_int16_t2(int16_t2 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<3 x i16> @llvm.[[ICF]].quad.read.across.y.v3i16(<3 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <3 x i16> @llvm.[[ICF]].quad.read.across.y.v3i16(<3 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <3 x i16> %[[RET]] int16_t3 test_int16_t3(int16_t3 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<4 x i16> @llvm.[[ICF]].quad.read.across.y.v4i16(<4 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <4 x i16> @llvm.[[ICF]].quad.read.across.y.v4i16(<4 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <4 x i16> %[[RET]] int16_t4 test_int16_t4(int16_t4 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]i16 @llvm.[[ICF]].quad.read.across.y.i16(i16 %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call i16 @llvm.[[ICF]].quad.read.across.y.i16(i16 %[[#]]) // CHECK-NATIVE_HALF: ret i16 %[[RET]] uint16_t test_uint16_t(uint16_t expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<2 x i16> @llvm.[[ICF]].quad.read.across.y.v2i16(<2 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <2 x i16> @llvm.[[ICF]].quad.read.across.y.v2i16(<2 x i16> %[[#]]) // 
CHECK-NATIVE_HALF: ret <2 x i16> %[[RET]] uint16_t2 test_uint16_t2(uint16_t2 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<3 x i16> @llvm.[[ICF]].quad.read.across.y.v3i16(<3 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <3 x i16> @llvm.[[ICF]].quad.read.across.y.v3i16(<3 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <3 x i16> %[[RET]] uint16_t3 test_uint16_t3(uint16_t3 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<4 x i16> @llvm.[[ICF]].quad.read.across.y.v4i16(<4 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <4 x i16> @llvm.[[ICF]].quad.read.across.y.v4i16(<4 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <4 x i16> %[[RET]] uint16_t4 test_uint16_t4(uint16_t4 expr) { return QuadReadAcrossY(expr); } #endif diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveAllEqual.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveAllEqual.hlsl index 323aa439984f9..f8bcdfdb3333f 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveAllEqual.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveAllEqual.hlsl @@ -9,7 +9,7 @@ // CHECK-LABEL: test_int bool test_int(int expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func i1 @llvm.spv.wave.all.equal.i32(i32 + // CHECK-SPIRV: %[[RET:.*]] = call i1 @llvm.spv.wave.all.equal.i32(i32 // CHECK-DXIL: %[[RET:.*]] = call i1 @llvm.dx.wave.all.equal.i32(i32 // CHECK: ret i1 %[[RET]] return WaveActiveAllEqual(expr); @@ -20,7 +20,7 @@ bool test_int(int expr) { // CHECK-LABEL: test_uint64_t bool test_uint64_t(uint64_t expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func i1 @llvm.spv.wave.all.equal.i64(i64 + // CHECK-SPIRV: %[[RET:.*]] = call i1 @llvm.spv.wave.all.equal.i64(i64 // CHECK-DXIL: %[[RET:.*]] = call i1 @llvm.dx.wave.all.equal.i64(i64 // CHECK: ret i1 %[[RET]] return WaveActiveAllEqual(expr); @@ -33,7 +33,7 @@ bool test_uint64_t(uint64_t expr) { // CHECK-LABEL: test_floatv4 bool4 test_floatv4(float4 expr) { - // CHECK-SPIRV: %[[RET1:.*]] = 
call spir_func <4 x i1> @llvm.spv.wave.all.equal.v4f32(<4 x float> + // CHECK-SPIRV: %[[RET1:.*]] = call <4 x i1> @llvm.spv.wave.all.equal.v4f32(<4 x float> // CHECK-DXIL: %[[RET1:.*]] = call <4 x i1> @llvm.dx.wave.all.equal.v4f32(<4 x float> // CHECK: ret <4 x i1> %[[RET1]] return WaveActiveAllEqual(expr); diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl index f499fc97f43fc..94060ceb97e66 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl @@ -10,7 +10,7 @@ // CHECK-LABEL: define {{.*}}test bool test(bool p1) { // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func i1 @llvm.spv.wave.all(i1 %{{[a-zA-Z0-9]+}}) [ "convergencectrl"(token %[[#entry_tok0]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call i1 @llvm.spv.wave.all(i1 %{{[a-zA-Z0-9]+}}) [ "convergencectrl"(token %[[#entry_tok0]]) ] // CHECK-DXIL: %[[RET:.*]] = call i1 @llvm.dx.wave.all(i1 %{{[a-zA-Z0-9]+}}) // CHECK: ret i1 %[[RET]] return WaveActiveAllTrue(p1); diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl index 3655cdb443fa9..c4b8239448f2c 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl @@ -10,7 +10,7 @@ // CHECK-LABEL: define {{.*}}test bool test(bool p1) { // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func i1 @llvm.spv.wave.any(i1 %{{[a-zA-Z0-9]+}}) [ "convergencectrl"(token %[[#entry_tok0]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call i1 @llvm.spv.wave.any(i1 %{{[a-zA-Z0-9]+}}) [ "convergencectrl"(token %[[#entry_tok0]]) ] // CHECK-DXIL: %[[RET:.*]] = call i1 @llvm.dx.wave.any(i1 %{{[a-zA-Z0-9]+}}) // CHECK: ret i1 %[[RET]] return WaveActiveAnyTrue(p1); 
diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveBallot.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveBallot.hlsl index df2d854a64247..4c7d5cd2a1c4a 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveBallot.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveBallot.hlsl @@ -10,7 +10,7 @@ // CHECK-LABEL: define {{.*}}test uint4 test(bool p1) { // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[SPIRVRET:.*]] = call spir_func <4 x i32> @llvm.spv.subgroup.ballot(i1 %{{[a-zA-Z0-9]+}}) [ "convergencectrl"(token %[[#entry_tok0]]) ] + // CHECK-SPIRV: %[[SPIRVRET:.*]] = call <4 x i32> @llvm.spv.subgroup.ballot(i1 %{{[a-zA-Z0-9]+}}) [ "convergencectrl"(token %[[#entry_tok0]]) ] // CHECK-DXIL: %[[WAB:.*]] = call { i32, i32, i32, i32 } @llvm.dx.wave.ballot.i32(i1 %{{[a-zA-Z0-9]+}}) // CHECK-DXIL-NEXT: extractvalue { i32, i32, i32, i32 } {{.*}} 0 // CHECK-DXIL-NEXT: insertelement <4 x i32> poison, i32 {{.*}}, i32 0 diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveBitAnd.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveBitAnd.hlsl index a6da9678d7275..78b3feb5ade66 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveBitAnd.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveBitAnd.hlsl @@ -1,17 +1,17 @@ // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ // RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ -// RUN: FileCheck %s --check-prefixes=CHECK,DXCHECK -DCALL="call" +// RUN: FileCheck %s --check-prefixes=CHECK,DXCHECK // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ // RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ -// RUN: FileCheck %s --check-prefixes=CHECK,SPVCHECK -DCALL="call spir_func" +// RUN: FileCheck %s --check-prefixes=CHECK,SPVCHECK // Test basic lowering to runtime function call. 
// CHECK-LABEL: test_uint uint test_uint(uint expr) { - // DXCHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF:dx]].wave.reduce.and.i32([[TY]] %[[#]]) - // SPVCHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF:spv]].wave.reduce.and.i32([[TY]] %[[#]]) + // DXCHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF:dx]].wave.reduce.and.i32([[TY]] %[[#]]) + // SPVCHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF:spv]].wave.reduce.and.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } @@ -20,7 +20,7 @@ uint test_uint(uint expr) { // CHECK-LABEL: test_uint2 uint2 test_uint2(uint2 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v2i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v2i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } @@ -29,7 +29,7 @@ uint2 test_uint2(uint2 expr) { // CHECK-LABEL: test_uint3 uint3 test_uint3(uint3 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v3i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v3i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } @@ -38,7 +38,7 @@ uint3 test_uint3(uint3 expr) { // CHECK-LABEL: test_uint4 uint4 test_uint4(uint4 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v4i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v4i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } @@ -47,7 +47,7 @@ uint4 test_uint4(uint4 expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } @@ -56,7 +56,7 @@ uint64_t test_uint64_t(uint64_t 
expr) { // CHECK-LABEL: test_uint64_t2 uint64_t2 test_uint64_t2(uint64_t2 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v2i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v2i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } @@ -65,7 +65,7 @@ uint64_t2 test_uint64_t2(uint64_t2 expr) { // CHECK-LABEL: test_uint64_t3 uint64_t3 test_uint64_t3(uint64_t3 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v3i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v3i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } @@ -74,7 +74,7 @@ uint64_t3 test_uint64_t3(uint64_t3 expr) { // CHECK-LABEL: test_uint64_t4 uint64_t4 test_uint64_t4(uint64_t4 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v4i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v4i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveBitOr.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveBitOr.hlsl index 80364724448fa..f92dec830256c 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveBitOr.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveBitOr.hlsl @@ -1,17 +1,17 @@ // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ // RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ -// RUN: FileCheck %s --check-prefixes=CHECK,DXCHECK -DCALL="call" +// RUN: FileCheck %s --check-prefixes=CHECK,DXCHECK // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ // RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ -// RUN: FileCheck %s --check-prefixes=CHECK,SPVCHECK -DCALL="call spir_func" +// RUN: FileCheck %s --check-prefixes=CHECK,SPVCHECK // Test basic lowering to runtime function call. 
// CHECK-LABEL: test_uint uint test_uint(uint expr) { - // DXCHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF:dx]].wave.reduce.or.i32([[TY]] %[[#]]) - // SPVCHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF:spv]].wave.reduce.or.i32([[TY]] %[[#]]) + // DXCHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF:dx]].wave.reduce.or.i32([[TY]] %[[#]]) + // SPVCHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF:spv]].wave.reduce.or.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } @@ -20,7 +20,7 @@ uint test_uint(uint expr) { // CHECK-LABEL: test_uint2 uint2 test_uint2(uint2 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v2i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v2i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } @@ -29,7 +29,7 @@ uint2 test_uint2(uint2 expr) { // CHECK-LABEL: test_uint3 uint3 test_uint3(uint3 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v3i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v3i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } @@ -38,7 +38,7 @@ uint3 test_uint3(uint3 expr) { // CHECK-LABEL: test_uint4 uint4 test_uint4(uint4 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v4i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v4i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } @@ -47,7 +47,7 @@ uint4 test_uint4(uint4 expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } @@ -56,7 +56,7 @@ uint64_t test_uint64_t(uint64_t expr) { // 
CHECK-LABEL: test_uint64_t2 uint64_t2 test_uint64_t2(uint64_t2 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v2i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v2i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } @@ -65,7 +65,7 @@ uint64_t2 test_uint64_t2(uint64_t2 expr) { // CHECK-LABEL: test_uint64_t3 uint64_t3 test_uint64_t3(uint64_t3 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v3i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v3i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } @@ -74,7 +74,7 @@ uint64_t3 test_uint64_t3(uint64_t3 expr) { // CHECK-LABEL: test_uint64_t4 uint64_t4 test_uint64_t4(uint64_t4 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v4i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v4i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveBitXor.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveBitXor.hlsl index 9c94663390843..9d04ba92a3242 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveBitXor.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveBitXor.hlsl @@ -1,17 +1,17 @@ // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ // RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ -// RUN: FileCheck %s --check-prefixes=CHECK,DXCHECK -DCALL="call" +// RUN: FileCheck %s --check-prefixes=CHECK,DXCHECK // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ // RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ -// RUN: FileCheck %s --check-prefixes=CHECK,SPVCHECK -DCALL="call spir_func" +// RUN: FileCheck %s --check-prefixes=CHECK,SPVCHECK // Test basic lowering to runtime function call. 
// CHECK-LABEL: test_uint uint test_uint(uint expr) { - // DXCHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF:dx]].wave.reduce.xor.i32([[TY]] %[[#]]) - // SPVCHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF:spv]].wave.reduce.xor.i32([[TY]] %[[#]]) + // DXCHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF:dx]].wave.reduce.xor.i32([[TY]] %[[#]]) + // SPVCHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF:spv]].wave.reduce.xor.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } @@ -20,7 +20,7 @@ uint test_uint(uint expr) { // CHECK-LABEL: test_uint2 uint2 test_uint2(uint2 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v2i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v2i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } @@ -29,7 +29,7 @@ uint2 test_uint2(uint2 expr) { // CHECK-LABEL: test_uint3 uint3 test_uint3(uint3 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v3i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v3i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } @@ -38,7 +38,7 @@ uint3 test_uint3(uint3 expr) { // CHECK-LABEL: test_uint4 uint4 test_uint4(uint4 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v4i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v4i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } @@ -47,7 +47,7 @@ uint4 test_uint4(uint4 expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } @@ -56,7 +56,7 @@ uint64_t test_uint64_t(uint64_t 
expr) { // CHECK-LABEL: test_uint64_t2 uint64_t2 test_uint64_t2(uint64_t2 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v2i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v2i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } @@ -65,7 +65,7 @@ uint64_t2 test_uint64_t2(uint64_t2 expr) { // CHECK-LABEL: test_uint64_t3 uint64_t3 test_uint64_t3(uint64_t3 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v3i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v3i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } @@ -74,7 +74,7 @@ uint64_t3 test_uint64_t3(uint64_t3 expr) { // CHECK-LABEL: test_uint64_t4 uint64_t4 test_uint64_t4(uint64_t4 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v4i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v4i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveMax.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveMax.hlsl index be05a17cc3692..a4628ad103e0d 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveMax.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveMax.hlsl @@ -9,7 +9,7 @@ // CHECK-LABEL: test_int int test_int(int expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.max.i32([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.reduce.max.i32([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.max.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveMax(expr); @@ -20,7 +20,7 @@ int test_int(int expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.umax.i64([[TY]] %[[#]]) 
+ // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.reduce.umax.i64([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.umax.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveMax(expr); @@ -33,7 +33,7 @@ uint64_t test_uint64_t(uint64_t expr) { // CHECK-LABEL: test_floatv4 float4 test_floatv4(float4 expr) { - // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.reduce.max.v4f32([[TY1]] %[[#]] + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.spv.wave.reduce.max.v4f32([[TY1]] %[[#]] // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.reduce.max.v4f32([[TY1]] %[[#]]) // CHECK: ret [[TY1]] %[[RET1]] return WaveActiveMax(expr); diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl index 1194f842deed6..f2e3686947f51 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl @@ -9,7 +9,7 @@ // CHECK-LABEL: test_int int test_int(int expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.min.i32([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.reduce.min.i32([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.min.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveMin(expr); @@ -20,7 +20,7 @@ int test_int(int expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.umin.i64([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.reduce.umin.i64([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.umin.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveMin(expr); @@ -33,7 +33,7 @@ uint64_t test_uint64_t(uint64_t 
expr) { // CHECK-LABEL: test_floatv4 float4 test_floatv4(float4 expr) { - // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.reduce.min.v4f32([[TY1]] %[[#]] + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.spv.wave.reduce.min.v4f32([[TY1]] %[[#]] // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.reduce.min.v4f32([[TY1]] %[[#]]) // CHECK: ret [[TY1]] %[[RET1]] return WaveActiveMin(expr); diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveProduct.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveProduct.hlsl index 3a8320e7333fc..0247b7cbeb0f6 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveProduct.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveProduct.hlsl @@ -9,7 +9,7 @@ // CHECK-LABEL: test_int int test_int(int expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.product.i32([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.product.i32([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.product.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveProduct(expr); @@ -20,7 +20,7 @@ int test_int(int expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.product.i64([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.product.i64([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.uproduct.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveProduct(expr); @@ -33,7 +33,7 @@ uint64_t test_uint64_t(uint64_t expr) { // CHECK-LABEL: test_floatv4 float4 test_floatv4(float4 expr) { - // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.product.v4f32([[TY1]] %[[#]] + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] 
@llvm.spv.wave.product.v4f32([[TY1]] %[[#]] // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.product.v4f32([[TY1]] %[[#]]) // CHECK: ret [[TY1]] %[[RET1]] return WaveActiveProduct(expr); diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveSum.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveSum.hlsl index 1fc93c62c8db0..6caa3d775f0d2 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveSum.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveSum.hlsl @@ -9,7 +9,7 @@ // CHECK-LABEL: test_int int test_int(int expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.sum.i32([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.reduce.sum.i32([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.sum.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveSum(expr); @@ -20,7 +20,7 @@ int test_int(int expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.sum.i64([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.reduce.sum.i64([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.usum.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveSum(expr); @@ -33,7 +33,7 @@ uint64_t test_uint64_t(uint64_t expr) { // CHECK-LABEL: test_floatv4 float4 test_floatv4(float4 expr) { - // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.reduce.sum.v4f32([[TY1]] %[[#]] + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.spv.wave.reduce.sum.v4f32([[TY1]] %[[#]] // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.reduce.sum.v4f32([[TY1]] %[[#]]) // CHECK: ret [[TY1]] %[[RET1]] return WaveActiveSum(expr); diff --git 
a/clang/test/CodeGenHLSL/builtins/WavePrefixCountBits.hlsl b/clang/test/CodeGenHLSL/builtins/WavePrefixCountBits.hlsl index 25d9074b08a68..bfd42740ac4ed 100644 --- a/clang/test/CodeGenHLSL/builtins/WavePrefixCountBits.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WavePrefixCountBits.hlsl @@ -18,7 +18,7 @@ int test_int(bool expr) { // CHECK: %[[LOADEDVAL:.*]] = load i32, ptr %[[EXPRADDR]], align 4 // CHECK: %[[TRUNCLOADEDVAL:.*]] = icmp ne i32 %[[LOADEDVAL]], 0 - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.subgroup.prefix.bit.count(i1 %[[TRUNCLOADEDVAL]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.subgroup.prefix.bit.count(i1 %[[TRUNCLOADEDVAL]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.prefix.bit.count(i1 %[[TRUNCLOADEDVAL]]) // CHECK: ret [[TY]] %[[RET]] return WavePrefixCountBits(expr); diff --git a/clang/test/CodeGenHLSL/builtins/WavePrefixProduct.hlsl b/clang/test/CodeGenHLSL/builtins/WavePrefixProduct.hlsl index a45cbf29b87f2..a4dc01527a7f2 100644 --- a/clang/test/CodeGenHLSL/builtins/WavePrefixProduct.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WavePrefixProduct.hlsl @@ -9,7 +9,7 @@ // CHECK-LABEL: test_int int test_int(int expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.prefix.product.i32([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.prefix.product.i32([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.prefix.product.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WavePrefixProduct(expr); @@ -20,7 +20,7 @@ int test_int(int expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.prefix.product.i64([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.prefix.product.i64([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.prefix.uproduct.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] 
return WavePrefixProduct(expr); @@ -33,7 +33,7 @@ uint64_t test_uint64_t(uint64_t expr) { // CHECK-LABEL: test_floatv4 float4 test_floatv4(float4 expr) { - // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.prefix.product.v4f32([[TY1]] %[[#]] + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.spv.wave.prefix.product.v4f32([[TY1]] %[[#]] // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.prefix.product.v4f32([[TY1]] %[[#]]) // CHECK: ret [[TY1]] %[[RET1]] return WavePrefixProduct(expr); diff --git a/clang/test/CodeGenHLSL/builtins/WavePrefixSum.hlsl b/clang/test/CodeGenHLSL/builtins/WavePrefixSum.hlsl index f22aa69ba45d5..a1df3fe02c802 100644 --- a/clang/test/CodeGenHLSL/builtins/WavePrefixSum.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WavePrefixSum.hlsl @@ -9,7 +9,7 @@ // CHECK-LABEL: test_int int test_int(int expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.prefix.sum.i32([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.prefix.sum.i32([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.prefix.sum.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WavePrefixSum(expr); @@ -20,7 +20,7 @@ int test_int(int expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.prefix.sum.i64([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.prefix.sum.i64([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.prefix.usum.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WavePrefixSum(expr); @@ -33,7 +33,7 @@ uint64_t test_uint64_t(uint64_t expr) { // CHECK-LABEL: test_floatv4 float4 test_floatv4(float4 expr) { - // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] 
@llvm.spv.wave.prefix.sum.v4f32([[TY1]] %[[#]] + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.spv.wave.prefix.sum.v4f32([[TY1]] %[[#]] // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.prefix.sum.v4f32([[TY1]] %[[#]]) // CHECK: ret [[TY1]] %[[RET1]] return WavePrefixSum(expr); diff --git a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl index da6cbc40a79bb..24252f3fa3207 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl @@ -10,7 +10,7 @@ // CHECK-LABEL: test_int int test_int(int expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); @@ -22,7 +22,7 @@ int test_int(int expr, uint idx) { // CHECK-LABEL: test_uint uint test_uint(uint expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); @@ -31,7 +31,7 @@ uint test_uint(uint expr, uint idx) { // 
CHECK-LABEL: test_int64_t int64_t test_int64_t(int64_t expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i64([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); @@ -43,7 +43,7 @@ int64_t test_int64_t(int64_t expr, uint idx) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i64([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); @@ -53,7 +53,7 @@ uint64_t test_uint64_t(uint64_t expr, uint idx) { // CHECK-LABEL: test_int16 int16_t test_int16(int16_t expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok1:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok1]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok1]]) ] // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return 
WaveReadLaneAt(expr, idx); @@ -65,7 +65,7 @@ int16_t test_int16(int16_t expr, uint idx) { // CHECK-LABEL: test_uint16 uint16_t test_uint16(uint16_t expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok1:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok1]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok1]]) ] // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); @@ -77,7 +77,7 @@ uint16_t test_uint16(uint16_t expr, uint idx) { // CHECK-LABEL: test_half half test_half(half expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok2:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY:.*]] @llvm.spv.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok2]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[TY:.*]] @llvm.spv.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok2]]) ] // CHECK-DXIL: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[TY:.*]] @llvm.dx.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); @@ -89,7 +89,7 @@ half test_half(half expr, uint idx) { // CHECK-LABEL: test_double double test_double(double expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok3:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY:.*]] @llvm.spv.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok3]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[TY:.*]] 
@llvm.spv.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok3]]) ] // CHECK-DXIL: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[TY:.*]] @llvm.dx.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); @@ -101,7 +101,7 @@ double test_double(double expr, uint idx) { // CHECK-LABEL: test_floatv4 float4 test_floatv4(float4 expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok4:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok4]]) ] + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.spv.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok4]]) ] // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY1]] %[[RET1]] return WaveReadLaneAt(expr, idx); diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_count.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_count.hlsl index 8072f6d4ea206..fdf019262d8cb 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_count.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_count.hlsl @@ -14,13 +14,13 @@ void main() { while (a) { // CHECK-DXIL: %[[#]] = call i32 @llvm.dx.wave.get.lane.count() -// CHECK-SPIRV: %[[#]] = call spir_func i32 @llvm.spv.wave.get.lane.count() +// CHECK-SPIRV: %[[#]] = call i32 @llvm.spv.wave.get.lane.count() // CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#loop_tok]]) ] a = WaveGetLaneCount(); } // CHECK-DXIL: %[[#]] = call i32 @llvm.dx.wave.get.lane.count() -// CHECK-SPIRV: %[[#]] = call spir_func i32 @llvm.spv.wave.get.lane.count() +// CHECK-SPIRV: %[[#]] = call i32 @llvm.spv.wave.get.lane.count() // CHECK-SPIRV-SAME: [ "convergencectrl"(token 
%[[#entry_tok]]) ] b = WaveGetLaneCount(); } diff --git a/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl b/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl index 2fb6defb896f9..18860c321eb91 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl @@ -13,7 +13,7 @@ void main() { while (true) { // CHECK-DXIL: %[[#]] = call i1 @llvm.dx.wave.is.first.lane() -// CHECK-SPIRV: %[[#]] = call spir_func i1 @llvm.spv.wave.is.first.lane() +// CHECK-SPIRV: %[[#]] = call i1 @llvm.spv.wave.is.first.lane() // CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#loop_tok]]) ] if (WaveIsFirstLane()) { break; @@ -21,7 +21,7 @@ void main() { } // CHECK-DXIL: %[[#]] = call i1 @llvm.dx.wave.is.first.lane() -// CHECK-SPIRV: %[[#]] = call spir_func i1 @llvm.spv.wave.is.first.lane() +// CHECK-SPIRV: %[[#]] = call i1 @llvm.spv.wave.is.first.lane() // CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#entry_tok]]) ] if (WaveIsFirstLane()) { return; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 0b4cdd0c2c28f..f0531bef642b0 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -1296,7 +1296,7 @@ void test_permlane16_swap(global uint2* out, uint old, uint src) { // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.bcast(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.bcast.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 // 
CHECK-NEXT: ret void @@ -1322,7 +1322,7 @@ void test_permlane_bcast(global uint* out, uint src0, uint src1, uint src2) { // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.down(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.down.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: ret void @@ -1348,7 +1348,7 @@ void test_permlane_down(global uint* out, uint src0, uint src1, uint src2) { // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.up(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.up.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: ret void @@ -1374,7 +1374,7 @@ void test_permlane_up(global uint* out, uint src0, uint src1, uint src2) { // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.xor(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.xor.i32(i32 [[TMP0]], i32 [[TMP1]], i32 
[[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: ret void @@ -1514,7 +1514,7 @@ void test_s_wakeup_barrier(void *bar) // CHECK-NEXT: store float [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[X_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.ignore.denormal.mode [[META4]] +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.ignore.denormal.mode [[META3]] // CHECK-NEXT: ret float [[TMP2]] // float test_global_add_f32(global float *addr, float x) { @@ -1531,7 +1531,7 @@ float test_global_add_f32(global float *addr, float x) { // CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]] // CHECK-NEXT: ret <2 x half> [[TMP2]] // half2 test_global_add_half2(global half2 *addr, half2 x) { @@ -1548,7 +1548,7 @@ half2 test_global_add_half2(global half2 *addr, half2 x) { // CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8 
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]] // CHECK-NEXT: ret <2 x half> [[TMP2]] // half2 test_flat_add_2f16(generic half2 *addr, half2 x) { @@ -1566,7 +1566,7 @@ half2 test_flat_add_2f16(generic half2 *addr, half2 x) { // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat> -// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]] // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16> // CHECK-NEXT: ret <2 x i16> [[TMP4]] // @@ -1585,7 +1585,7 @@ short2 test_flat_add_2bf16(generic short2 *addr, short2 x) { // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat> -// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]] // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16> // CHECK-NEXT: ret 
<2 x i16> [[TMP4]] // diff --git a/clang/test/Driver/Inputs/SYCL/two-kernels.ll b/clang/test/Driver/Inputs/SYCL/two-kernels.ll deleted file mode 100644 index c3c90444b7e72..0000000000000 --- a/clang/test/Driver/Inputs/SYCL/two-kernels.ll +++ /dev/null @@ -1,23 +0,0 @@ -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" -target triple = "spirv64" - -define spir_func i32 @helper_shared(i32 %a) { -entry: - %r = add nsw i32 %a, 1 - ret i32 %r -} - -define spir_kernel void @kernel_a(ptr addrspace(1) %out, i32 %a) { -entry: - %r = tail call spir_func i32 @helper_shared(i32 %a) - store i32 %r, ptr addrspace(1) %out, align 4 - ret void -} - -define spir_kernel void @kernel_b(ptr addrspace(1) %out, i32 %a, i32 %b) { -entry: - %h = tail call spir_func i32 @helper_shared(i32 %a) - %r = mul nsw i32 %h, %b - store i32 %r, ptr addrspace(1) %out, align 4 - ret void -} diff --git a/clang/test/Driver/clang-sycl-linker-test.cpp b/clang/test/Driver/clang-sycl-linker-test.cpp deleted file mode 100644 index cd99d4d47b1e1..0000000000000 --- a/clang/test/Driver/clang-sycl-linker-test.cpp +++ /dev/null @@ -1,99 +0,0 @@ -// Tests the clang-sycl-linker tool. -// -// REQUIRES: spirv-registered-target -// -// Test the dry run of a simple case to link two input files. -// Also verifies the default split mode ("none"). 
-// RUN: %clangxx -emit-llvm -c -target spirv64 %s -o %t_1.bc -// RUN: %clangxx -emit-llvm -c -target spirv64 %s -o %t_2.bc -// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 %t_1.bc %t_2.bc -o %t-spirv.out 2>&1 \ -// RUN: | FileCheck %s --check-prefix=SIMPLE-FO -// SIMPLE-FO: sycl-device-link: inputs: {{.*}}.bc, {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc -// SIMPLE-FO-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none -// SIMPLE-FO-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: {{.*}}_0.spv -// -// Test that IMG_SPIRV image kind is set for non-AOT compilation. -// RUN: llvm-objdump --offloading %t-spirv.out | FileCheck %s --check-prefix=IMAGE-KIND-SPIRV -// IMAGE-KIND-SPIRV: kind spir-v -// -// Test the dry run of a simple case with device library files specified. -// RUN: mkdir -p %t.dir -// RUN: touch %t.dir/lib1.bc -// RUN: touch %t.dir/lib2.bc -// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 %t_1.bc %t_2.bc --library-path=%t.dir --device-libs=lib1.bc,lib2.bc -o a.spv 2>&1 \ -// RUN: | FileCheck %s --check-prefix=DEVLIBS -// DEVLIBS: sycl-device-link: inputs: {{.*}}.bc libfiles: {{.*}}lib1.bc, {{.*}}lib2.bc output: [[LLVMLINKOUT:.*]].bc -// DEVLIBS-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none -// DEVLIBS-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: a_0.spv -// -// Test a simple case with a random file (not bitcode) as input. -// RUN: touch %t.o -// RUN: not clang-sycl-linker -triple=spirv64 %t.o -o a.spv 2>&1 \ -// RUN: | FileCheck %s --check-prefix=FILETYPEERROR -// FILETYPEERROR: Unsupported file type -// -// Test to see if device library related errors are emitted. 
-// RUN: not clang-sycl-linker --dry-run -triple=spirv64 %t_1.bc %t_2.bc --library-path=%t.dir --device-libs= -o a.spv 2>&1 \ -// RUN: | FileCheck %s --check-prefix=DEVLIBSERR1 -// DEVLIBSERR1: Number of device library files cannot be zero -// RUN: not clang-sycl-linker --dry-run -triple=spirv64 %t_1.bc %t_2.bc --library-path=%t.dir --device-libs=lib1.bc,lib2.bc,lib3.bc -o a.spv 2>&1 \ -// RUN: | FileCheck %s --check-prefix=DEVLIBSERR2 -// DEVLIBSERR2: '{{.*}}lib3.bc' SYCL device library file is not found -// -// Test AOT compilation for an Intel GPU. -// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 -arch=bmg_g21 %t_1.bc %t_2.bc -o %t-aot-gpu.out 2>&1 \ -// RUN: --ocloc-options="-a -b" \ -// RUN: | FileCheck %s --check-prefix=AOT-INTEL-GPU -// AOT-INTEL-GPU: sycl-device-link: inputs: {{.*}}.bc, {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc -// AOT-INTEL-GPU-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none -// AOT-INTEL-GPU-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: [[SPIRVTRANSLATIONOUT:.*]]_0.spv -// AOT-INTEL-GPU-NEXT: "{{.*}}ocloc{{.*}}" {{.*}}-device bmg_g21 -a -b {{.*}}-output [[SPIRVTRANSLATIONOUT]]_0.out -file [[SPIRVTRANSLATIONOUT]]_0.spv -// -// Test that IMG_Object image kind is set for AOT compilation (Intel GPU). -// RUN: llvm-objdump --offloading %t-aot-gpu.out | FileCheck %s --check-prefix=IMAGE-KIND-OBJECT -// IMAGE-KIND-OBJECT: kind elf -// -// Test AOT compilation for an Intel CPU. 
-// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 -arch=graniterapids %t_1.bc %t_2.bc -o %t-aot-cpu.out 2>&1 \ -// RUN: --opencl-aot-options="-a -b" \ -// RUN: | FileCheck %s --check-prefix=AOT-INTEL-CPU -// AOT-INTEL-CPU: sycl-device-link: inputs: {{.*}}.bc, {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc -// AOT-INTEL-CPU-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none -// AOT-INTEL-CPU-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: [[SPIRVTRANSLATIONOUT:.*]]_0.spv -// AOT-INTEL-CPU-NEXT: "{{.*}}opencl-aot{{.*}}" {{.*}}--device=cpu -a -b {{.*}}-o [[SPIRVTRANSLATIONOUT]]_0.out [[SPIRVTRANSLATIONOUT]]_0.spv -// -// Test that IMG_Object image kind is set for AOT compilation (Intel CPU). -// RUN: llvm-objdump --offloading %t-aot-cpu.out | FileCheck %s --check-prefix=IMAGE-KIND-OBJECT -// -// Check that the output file must be specified. -// RUN: not clang-sycl-linker --dry-run %t_1.bc %t_2.bc 2>&1 \ -// RUN: | FileCheck %s --check-prefix=NOOUTPUT -// NOOUTPUT: Output file must be specified -// -// Check that the target triple must be specified. -// RUN: not clang-sycl-linker --dry-run %t_1.bc %t_2.bc -o a.out 2>&1 \ -// RUN: | FileCheck %s --check-prefix=NOTARGET -// NOTARGET: Target triple must be specified -// -// Test the split mode ("none"): no extra splits are produced. -// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 --module-split-mode=none %t_1.bc %t_2.bc -o %t-split-none.out 2>&1 \ -// RUN: | FileCheck %s --check-prefix=SPLIT-NONE -// SPLIT-NONE: sycl-device-link: inputs: {{.*}}.bc, {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc -// SPLIT-NONE-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none -// SPLIT-NONE-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: {{.*}}_0.spv -// SPLIT-NONE-NOT: LLVM backend: input: {{.*}}.bc, output: {{.*}}_1.spv -// -// Test per-kernel split: a module with two SPIR_KERNEL functions produces two -// device images. 
-// RUN: llvm-as %S/Inputs/SYCL/two-kernels.ll -o %t-two.bc -// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 --module-split-mode=kernel %t-two.bc -o %t-split-kernel.out 2>&1 \ -// RUN: | FileCheck %s --check-prefix=SPLIT-KERNEL -// SPLIT-KERNEL: sycl-device-link: inputs: {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc -// SPLIT-KERNEL-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[SPLIT0:.*]].bc, [[SPLIT1:.*]].bc, mode: kernel -// SPLIT-KERNEL-NEXT: LLVM backend: input: [[SPLIT0]].bc, output: {{.*}}_0.spv -// SPLIT-KERNEL-NEXT: LLVM backend: input: [[SPLIT1]].bc, output: {{.*}}_1.spv -// -// Test that an invalid split mode is rejected. -// RUN: not clang-sycl-linker --dry-run -triple=spirv64 --module-split-mode=bogus %t_1.bc -o a.out 2>&1 \ -// RUN: | FileCheck %s --check-prefix=SPLIT-INVALID -// SPLIT-INVALID: module-split-mode value isn't recognized: bogus diff --git a/clang/test/Tooling/clang-sycl-linker-split-mode.ll b/clang/test/Tooling/clang-sycl-linker-split-mode.ll new file mode 100644 index 0000000000000..2b4b1cee4e171 --- /dev/null +++ b/clang/test/Tooling/clang-sycl-linker-split-mode.ll @@ -0,0 +1,51 @@ +; Tests the clang-sycl-linker tool: device code splitting. +; +; REQUIRES: spirv-registered-target +; +; RUN: llvm-as %s -o %t.bc +; +; Test that an invalid split mode is rejected. +; RUN: not clang-sycl-linker --dry-run -triple=spirv64 --module-split-mode=bogus %t.bc -o a.out 2>&1 \ +; RUN: | FileCheck %s --check-prefix=SPLIT-INVALID +; SPLIT-INVALID: module-split-mode value isn't recognized: bogus +; +; Test the split mode ("none"): no extra splits are produced. 
+; RUN: clang-sycl-linker --dry-run -v -triple=spirv64 --module-split-mode=none %t.bc -o %t-none.out 2>&1 \ +; RUN: | FileCheck %s --check-prefix=SPLIT-NONE +; SPLIT-NONE: sycl-device-link: inputs: {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc +; SPLIT-NONE-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none +; SPLIT-NONE-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: {{.*}}_0.spv +; SPLIT-NONE-NOT: LLVM backend: input: {{.*}}.bc, output: {{.*}}_1.spv +; +; Test per-kernel split: a module with two SPIR_KERNEL functions produces two +; device images. +; RUN: clang-sycl-linker --dry-run -v -triple=spirv64 --module-split-mode=kernel %t.bc -o %t-split-kernel.out 2>&1 \ +; RUN: | FileCheck %s --check-prefix=SPLIT-KERNEL +; SPLIT-KERNEL: sycl-device-link: inputs: {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc +; SPLIT-KERNEL-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[SPLIT0:.*]].bc, [[SPLIT1:.*]].bc, mode: kernel +; SPLIT-KERNEL-NEXT: LLVM backend: input: [[SPLIT0]].bc, output: {{.*}}_0.spv +; SPLIT-KERNEL-NEXT: LLVM backend: input: [[SPLIT1]].bc, output: {{.*}}_1.spv + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spirv64" + +define spir_func i32 @helper_shared(i32 %a) { +entry: + %r = add nsw i32 %a, 1 + ret i32 %r +} + +define spir_kernel void @kernel_a(ptr addrspace(1) %out, i32 %a) { +entry: + %r = tail call spir_func i32 @helper_shared(i32 %a) + store i32 %r, ptr addrspace(1) %out, align 4 + ret void +} + +define spir_kernel void @kernel_b(ptr addrspace(1) %out, i32 %a, i32 %b) { +entry: + %h = tail call spir_func i32 @helper_shared(i32 %a) + %r = mul nsw i32 %h, %b + store i32 %r, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/clang/test/Tooling/clang-sycl-linker.ll b/clang/test/Tooling/clang-sycl-linker.ll new file mode 100644 index 0000000000000..cf0fb33d1bc06 --- /dev/null +++ 
b/clang/test/Tooling/clang-sycl-linker.ll @@ -0,0 +1,97 @@ +; Tests the clang-sycl-linker tool. +; +; REQUIRES: spirv-registered-target +; +; RUN: rm -rf %t && split-file %s %t +; RUN: llvm-as %t/input1.ll -o %t/input1.bc +; RUN: llvm-as %t/input2.ll -o %t/input2.bc +; +; Test the dry run of a simple case to link two input files. +; RUN: clang-sycl-linker --dry-run -v -triple=spirv64 %t/input1.bc %t/input2.bc -o %t/spirv.out 2>&1 \ +; RUN: | FileCheck %s --check-prefix=SIMPLE-FO +; SIMPLE-FO: sycl-device-link: inputs: {{.*}}.bc, {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc +; SIMPLE-FO-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none +; SIMPLE-FO-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: {{.*}}_0.spv +; +; Test that IMG_SPIRV image kind is set for non-AOT compilation. +; RUN: llvm-objdump --offloading %t/spirv.out | FileCheck %s --check-prefix=IMAGE-KIND-SPIRV +; IMAGE-KIND-SPIRV: kind spir-v +; +; Test the dry run of a simple case with device library files specified. +; RUN: mkdir -p %t/libs +; RUN: touch %t/libs/lib1.bc +; RUN: touch %t/libs/lib2.bc +; RUN: clang-sycl-linker --dry-run -v -triple=spirv64 %t/input1.bc %t/input2.bc --library-path=%t/libs --device-libs=lib1.bc,lib2.bc -o a.spv 2>&1 \ +; RUN: | FileCheck %s --check-prefix=DEVLIBS +; DEVLIBS: sycl-device-link: inputs: {{.*}}.bc libfiles: {{.*}}lib1.bc, {{.*}}lib2.bc output: [[LLVMLINKOUT:.*]].bc +; DEVLIBS-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none +; DEVLIBS-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: a_0.spv +; +; Test a simple case with a random file (not bitcode) as input. +; RUN: touch %t/dummy.o +; RUN: not clang-sycl-linker -triple=spirv64 %t/dummy.o -o a.spv 2>&1 \ +; RUN: | FileCheck %s --check-prefix=FILETYPEERROR +; FILETYPEERROR: Unsupported file type +; +; Test to see if device library related errors are emitted. 
+; RUN: not clang-sycl-linker --dry-run -triple=spirv64 %t/input1.bc %t/input2.bc --library-path=%t/libs --device-libs= -o a.spv 2>&1 \ +; RUN: | FileCheck %s --check-prefix=DEVLIBSERR1 +; DEVLIBSERR1: Number of device library files cannot be zero +; RUN: not clang-sycl-linker --dry-run -triple=spirv64 %t/input1.bc %t/input2.bc --library-path=%t/libs --device-libs=lib1.bc,lib2.bc,lib3.bc -o a.spv 2>&1 \ +; RUN: | FileCheck %s --check-prefix=DEVLIBSERR2 +; DEVLIBSERR2: '{{.*}}lib3.bc' SYCL device library file is not found +; +; Test AOT compilation for an Intel GPU. +; RUN: clang-sycl-linker --dry-run -v -triple=spirv64 -arch=bmg_g21 %t/input1.bc %t/input2.bc -o %t/aot-gpu.out 2>&1 \ +; RUN: --ocloc-options="-a -b" \ +; RUN: | FileCheck %s --check-prefix=AOT-INTEL-GPU +; AOT-INTEL-GPU: sycl-device-link: inputs: {{.*}}.bc, {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc +; AOT-INTEL-GPU-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none +; AOT-INTEL-GPU-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: [[SPIRVTRANSLATIONOUT:.*]]_0.spv +; AOT-INTEL-GPU-NEXT: "{{.*}}ocloc{{.*}}" {{.*}}-device bmg_g21 -a -b {{.*}}-output [[SPIRVTRANSLATIONOUT]]_0.out -file [[SPIRVTRANSLATIONOUT]]_0.spv +; +; Test that IMG_Object image kind is set for AOT compilation (Intel GPU). +; RUN: llvm-objdump --offloading %t/aot-gpu.out | FileCheck %s --check-prefix=IMAGE-KIND-OBJECT +; IMAGE-KIND-OBJECT: kind elf +; +; Test AOT compilation for an Intel CPU. 
+; RUN: clang-sycl-linker --dry-run -v -triple=spirv64 -arch=graniterapids %t/input1.bc %t/input2.bc -o %t/aot-cpu.out 2>&1 \ +; RUN: --opencl-aot-options="-a -b" \ +; RUN: | FileCheck %s --check-prefix=AOT-INTEL-CPU +; AOT-INTEL-CPU: sycl-device-link: inputs: {{.*}}.bc, {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc +; AOT-INTEL-CPU-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none +; AOT-INTEL-CPU-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: [[SPIRVTRANSLATIONOUT:.*]]_0.spv +; AOT-INTEL-CPU-NEXT: "{{.*}}opencl-aot{{.*}}" {{.*}}--device=cpu -a -b {{.*}}-o [[SPIRVTRANSLATIONOUT]]_0.out [[SPIRVTRANSLATIONOUT]]_0.spv +; +; Test that IMG_Object image kind is set for AOT compilation (Intel CPU). +; RUN: llvm-objdump --offloading %t/aot-cpu.out | FileCheck %s --check-prefix=IMAGE-KIND-OBJECT +; +; Check that the output file must be specified. +; RUN: not clang-sycl-linker --dry-run %t/input1.bc %t/input2.bc 2>&1 \ +; RUN: | FileCheck %s --check-prefix=NOOUTPUT +; NOOUTPUT: Output file must be specified +; +; Check that the target triple must be specified. 
+; RUN: not clang-sycl-linker --dry-run %t/input1.bc %t/input2.bc -o a.out 2>&1 \ +; RUN: | FileCheck %s --check-prefix=NOTARGET +; NOTARGET: Target triple must be specified + +;--- input1.ll +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spirv64" + +define spir_kernel void @kernel_a() #0 { + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } + +;--- input2.ll +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spirv64" + +define spir_kernel void @kernel_b() #0 { + ret void +} + +attributes #0 = { "sycl-module-id"="TU2.cpp" } diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index f731922030777..eeaf5d3f66d96 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -927,6 +927,18 @@ TEST(ConfigParseTest, ParsesConfiguration) { CHECK_PARSE("AlwaysBreakAfterReturnType: TopLevelDefinitions", BreakAfterReturnType, FormatStyle::RTBS_TopLevelDefinitions); + Style.BreakBeforeReturnType = FormatStyle::BBRTS_All; + CHECK_PARSE("BreakBeforeReturnType: None", BreakBeforeReturnType, + FormatStyle::BBRTS_None); + CHECK_PARSE("BreakBeforeReturnType: All", BreakBeforeReturnType, + FormatStyle::BBRTS_All); + CHECK_PARSE("BreakBeforeReturnType: TopLevel", BreakBeforeReturnType, + FormatStyle::BBRTS_TopLevel); + CHECK_PARSE("BreakBeforeReturnType: AllDefinitions", BreakBeforeReturnType, + FormatStyle::BBRTS_AllDefinitions); + CHECK_PARSE("BreakBeforeReturnType: TopLevelDefinitions", + BreakBeforeReturnType, FormatStyle::BBRTS_TopLevelDefinitions); + Style.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; CHECK_PARSE("BreakTemplateDeclarations: Leave", BreakTemplateDeclarations, FormatStyle::BTDS_Leave); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp 
index 4245bd1c58153..54529a3d4e590 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -3139,14 +3139,18 @@ TEST_F(FormatTest, FormatsLabels) { "}"); verifyFormat("{\n" " some_code();\n" - "test_label: { some_other_code(); }\n" + "test_label:\n" + " {\n" + " some_other_code();\n" + " }\n" "}"); verifyFormat("{\n" " some_code();\n" - "test_label: {\n" - " some_other_code();\n" - " some_other_code();\n" - "}\n" + "test_label:\n" + " {\n" + " some_other_code();\n" + " some_other_code();\n" + " }\n" "}"); verifyFormat("{\n" "L0:\n" @@ -3155,10 +3159,11 @@ TEST_F(FormatTest, FormatsLabels) { " g();\n" "}"); verifyFormat("{\n" - "[[foo]] L1: {\n" - "[[bar]] [[baz]] L2:\n" - " g();\n" - "}\n" + "[[foo]] L1:\n" + " {\n" + " [[bar]] [[baz]] L2:\n" + " g();\n" + " }\n" "}"); verifyFormat("{\n" "[[foo]] L1:\n" @@ -3168,6 +3173,18 @@ TEST_F(FormatTest, FormatsLabels) { " g();\n" " }\n" "}"); + verifyFormat("void func() {\n" + "label:\n" + " {\n" + " // Block\n" + " }\n" + "}"); + verifyFormat("void func() {\n" + "label: // Comment\n" + " {\n" + " // Block\n" + " }\n" + "}"); FormatStyle Style = getLLVMStyle(); Style.IndentGotoLabels = FormatStyle::IGLS_NoIndent; @@ -3196,7 +3213,10 @@ TEST_F(FormatTest, FormatsLabels) { Style); verifyFormat("{\n" " some_code();\n" - "test_label: { some_other_code(); }\n" + "test_label:\n" + " {\n" + " some_other_code();\n" + " }\n" "}", Style); verifyFormat("{\n" @@ -3331,17 +3351,17 @@ TEST_F(FormatTest, FormatsLabels) { verifyFormat("{\n" " some_code();\n" "test_label:\n" - "{\n" - " some_other_code();\n" - "}\n" + " {\n" + " some_other_code();\n" + " }\n" "}", Style); verifyFormat("{\n" "[[foo]] L1:\n" - "{\n" - "[[bar]] [[baz]] L2:\n" - " g();\n" - "}\n" + " {\n" + " [[bar]] [[baz]] L2:\n" + " g();\n" + " }\n" "}", Style); } @@ -10594,6 +10614,172 @@ TEST_F(FormatTest, ReturnTypeBreakingStyle) { verifyFormat("void foo (int a, int b);", Style); } +TEST_F(FormatTest, BreakBeforeReturnType) { + 
FormatStyle Style = getLLVMStyle(); + Style.BreakBeforeReturnType = FormatStyle::BBRTS_All; + + verifyFormat("static inline\n" + "void myfun(void);", + Style); + verifyFormat("static\n" + "int x(void);", + Style); + + verifyFormat("void f(void);", Style); + verifyFormat("int g(int a);", Style); + + // Constructors and destructors are not affected. + verifyFormat("class C {\n" + " explicit C(int);\n" + " virtual ~C();\n" + "};", + Style); + + verifyFormat("__attribute__((always_inline)) static inline\n" + "void f(void);", + Style); + verifyFormat("static __forceinline\n" + "void f(void);", + Style); + verifyFormat("export\n" + "int f();", + Style); + verifyFormat( + "__attribute__((section(\".init\"), always_inline)) static inline\n" + "int boot(void);", + Style); + verifyFormat("[[nodiscard]] static constexpr\n" + "int f();", + Style); + verifyFormat("static\n" + "const struct foo *g(void);", + Style); + verifyFormat("class A {\n" + " friend\n" + " int f();\n" + "};", + Style); + + verifyFormat("static int x = 0;", Style); + verifyFormat("static const char *msg;", Style); + + verifyFormat("static\n" + "auto f() -> int;", + Style); + + Style.ColumnLimit = 50; + verifyFormat("__attribute__((always_inline)) static inline\n" + "int do_thing(int a, int b, int c);", + Style); + Style.ColumnLimit = 80; + + verifyFormat("static inline\n" + "int compute(int x) {\n" + " ++x;\n" + " return x;\n" + "}", + Style); + + Style.BreakAfterReturnType = FormatStyle::RTBS_All; + verifyFormat("static inline\n" + "void\n" + "f(void);", + Style); + Style.BreakAfterReturnType = FormatStyle::RTBS_None; + + Style.BreakAfterAttributes = FormatStyle::ABS_Always; + verifyFormat("[[nodiscard]]\n" + "static\n" + "int f();", + Style); + Style.BreakAfterAttributes = FormatStyle::ABS_Leave; + + Style.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; + verifyFormat("template \n" + "static inline\n" + "T f();", + Style); + verifyFormat("template \n" + " requires Foo\n" + "static inline\n" + "T f();", 
+ Style); + Style.BreakTemplateDeclarations = FormatStyle::BTDS_Leave; + + Style.BreakBeforeReturnType = FormatStyle::BBRTS_AllDefinitions; + verifyFormat("class A {\n" + " static inline int member();\n" + " static inline\n" + " int member_def() {\n" + " return 0;\n" + " }\n" + "};\n" + "static inline int top_decl();\n" + "static inline\n" + "int top_defn() {\n" + " ++x;\n" + " return 0;\n" + "}", + Style); + + Style.BreakBeforeReturnType = FormatStyle::BBRTS_TopLevel; + verifyFormat("class A {\n" + " static inline int member();\n" + " static inline int member_def() { return 0; }\n" + "};\n" + "static inline\n" + "int top_decl();\n" + "static inline\n" + "int top_defn() {\n" + " ++x;\n" + " return 0;\n" + "}", + Style); + + Style.BreakBeforeReturnType = FormatStyle::BBRTS_TopLevelDefinitions; + verifyFormat("class A {\n" + " static inline int member();\n" + " static inline int member_def() { return 0; }\n" + "};\n" + "static inline int top_decl();\n" + "static inline\n" + "int top_defn() {\n" + " ++x;\n" + " return 0;\n" + "}", + Style); + + Style.BreakBeforeReturnType = FormatStyle::BBRTS_All; + + Style.AttributeMacros = {"__always_inline"}; + verifyFormat("__always_inline\n" + "void f(void);", + Style); + + Style.AttributeMacros = {"__always_inline", "LIBC_INLINE"}; + verifyFormat("LIBC_INLINE static __always_inline\n" + "int compute(int x);", + Style); + + Style.AttributeMacros = {"ATTRIBUTE_PRINTF"}; + verifyFormat("ATTRIBUTE_PRINTF(1, 2) static\n" + "void log(const char *fmt, ...);", + Style); + + // Same identifier: unconfigured -> not a specifier; configured -> specifier. 
+ Style.AttributeMacros = {}; + verifyFormat("FOO static void f(void);", Style); + Style.AttributeMacros = {"FOO"}; + verifyFormat("FOO static\n" + "void f(void);", + Style); + + Style.AttributeMacros = {"LIBC_INLINE"}; + verifyFormat("[[nodiscard]] __attribute__((pure)) LIBC_INLINE static\n" + "int hash(int k);", + Style); +} + TEST_F(FormatTest, AlwaysBreakBeforeMultilineStrings) { FormatStyle NoBreak = getLLVMStyle(); NoBreak.AlwaysBreakBeforeMultilineStrings = false; @@ -18572,7 +18758,10 @@ TEST_F(FormatTest, ConfigurableSpaceBeforeColon) { "}", CaseStyle); verifyFormat("switch (x) {\n" - "goto_label: { break; }\n" + "goto_label:\n" + " {\n" + " break;\n" + " }\n" "default : {\n" " break;\n" "}\n" diff --git a/clang/unittests/Format/FormatTestVerilog.cpp b/clang/unittests/Format/FormatTestVerilog.cpp index 23ff1158e00cf..66295b441d3ce 100644 --- a/clang/unittests/Format/FormatTestVerilog.cpp +++ b/clang/unittests/Format/FormatTestVerilog.cpp @@ -1013,6 +1013,8 @@ TEST_F(FormatTestVerilog, Instantiation) { TEST_F(FormatTestVerilog, Loop) { verifyFormat("foreach (x[x])\n" " x = x;"); + verifyFormat("(* x = \"x\" *) foreach (x[x])\n" + " x = x;"); verifyFormat("repeat (x)\n" " x = x;"); verifyFormat("foreach (x[x]) begin\n" diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index d9e4caa3ca180..dd54e261d55e7 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -102,19 +102,22 @@ check_cxx_compiler_flag(-fno-sanitize=safe-stack COMPILER_RT_HAS_FNO_SANITIZE_SA check_cxx_compiler_flag(-fvisibility=hidden COMPILER_RT_HAS_FVISIBILITY_HIDDEN_FLAG) check_cxx_compiler_flag(-frtti COMPILER_RT_HAS_FRTTI_FLAG) check_cxx_compiler_flag(-fno-rtti COMPILER_RT_HAS_FNO_RTTI_FLAG) -check_cxx_compiler_flag("-Werror -fno-function-sections" COMPILER_RT_HAS_FNO_FUNCTION_SECTIONS_FLAG) +check_cxx_compiler_flag("-Werror;-fno-function-sections" COMPILER_RT_HAS_FNO_FUNCTION_SECTIONS_FLAG) 
check_cxx_compiler_flag(-ftls-model=initial-exec COMPILER_RT_HAS_FTLS_MODEL_INITIAL_EXEC) check_cxx_compiler_flag(-fno-lto COMPILER_RT_HAS_FNO_LTO_FLAG) check_cxx_compiler_flag(-fno-profile-generate COMPILER_RT_HAS_FNO_PROFILE_GENERATE_FLAG) check_cxx_compiler_flag(-fno-profile-instr-generate COMPILER_RT_HAS_FNO_PROFILE_INSTR_GENERATE_FLAG) check_cxx_compiler_flag(-fno-profile-instr-use COMPILER_RT_HAS_FNO_PROFILE_INSTR_USE_FLAG) check_cxx_compiler_flag(-fno-coverage-mapping COMPILER_RT_HAS_FNO_COVERAGE_MAPPING_FLAG) -check_cxx_compiler_flag("-Werror -mcrc32" COMPILER_RT_HAS_MCRC32_FLAG) -check_cxx_compiler_flag("-Werror -msse4.2" COMPILER_RT_HAS_MSSE4_2_FLAG) +check_cxx_compiler_flag("-Werror;-mcrc32" COMPILER_RT_HAS_MCRC32_FLAG) +check_cxx_compiler_flag("-Werror;-msse4.2" COMPILER_RT_HAS_MSSE4_2_FLAG) check_cxx_compiler_flag(--sysroot=. COMPILER_RT_HAS_SYSROOT_FLAG) -check_cxx_compiler_flag("-Werror -mcrc" COMPILER_RT_HAS_MCRC_FLAG) +check_cxx_compiler_flag("-Werror;-mcrc" COMPILER_RT_HAS_MCRC_FLAG) check_cxx_compiler_flag(-fno-partial-inlining COMPILER_RT_HAS_FNO_PARTIAL_INLINING_FLAG) -check_cxx_compiler_flag("-Werror -ftrivial-auto-var-init=pattern" COMPILER_RT_HAS_TRIVIAL_AUTO_INIT) +check_cxx_compiler_flag("-Werror;-ftrivial-auto-var-init=pattern" COMPILER_RT_HAS_TRIVIAL_AUTO_INIT) +check_c_compiler_flag(-nogpulib COMPILER_RT_HAS_NOGPULIB_FLAG) +check_c_compiler_flag(-flto COMPILER_RT_HAS_FLTO_FLAG) +check_c_compiler_flag("-Xclang;-mcode-object-version=none" COMPILER_RT_HAS_CODE_OBJECT_VERSION_FLAG) if(NOT WIN32 AND NOT CYGWIN) # MinGW warns if -fvisibility-inlines-hidden is used. @@ -134,24 +137,24 @@ check_cxx_compiler_flag(/Zi COMPILER_RT_HAS_Zi_FLAG) # Warnings. 
check_cxx_compiler_flag(-Wall COMPILER_RT_HAS_WALL_FLAG) check_cxx_compiler_flag(-Werror COMPILER_RT_HAS_WERROR_FLAG) -check_cxx_compiler_flag("-Werror -Wframe-larger-than=512" COMPILER_RT_HAS_WFRAME_LARGER_THAN_FLAG) -check_cxx_compiler_flag("-Werror -Wglobal-constructors" COMPILER_RT_HAS_WGLOBAL_CONSTRUCTORS_FLAG) -check_cxx_compiler_flag("-Werror -Wc99-extensions" COMPILER_RT_HAS_WC99_EXTENSIONS_FLAG) -check_cxx_compiler_flag("-Werror -Wgnu" COMPILER_RT_HAS_WGNU_FLAG) -check_cxx_compiler_flag("-Werror -Wgnu-anonymous-struct" COMPILER_RT_HAS_WGNU_ANONYMOUS_STRUCT_FLAG) -check_cxx_compiler_flag("-Werror -Wvariadic-macros" COMPILER_RT_HAS_WVARIADIC_MACROS_FLAG) -check_cxx_compiler_flag("-Werror -Wunused-parameter" COMPILER_RT_HAS_WUNUSED_PARAMETER_FLAG) -check_cxx_compiler_flag("-Werror -Wcovered-switch-default" COMPILER_RT_HAS_WCOVERED_SWITCH_DEFAULT_FLAG) -check_cxx_compiler_flag("-Werror -Wsuggest-override" COMPILER_RT_HAS_WSUGGEST_OVERRIDE_FLAG) -check_cxx_compiler_flag("-Werror -Wthread-safety" COMPILER_RT_HAS_WTHREAD_SAFETY_FLAG) -check_cxx_compiler_flag("-Werror -Wthread-safety-reference" COMPILER_RT_HAS_WTHREAD_SAFETY_REFERENCE_FLAG) -check_cxx_compiler_flag("-Werror -Wthread-safety-beta" COMPILER_RT_HAS_WTHREAD_SAFETY_BETA_FLAG) +check_cxx_compiler_flag("-Werror;-Wframe-larger-than=512" COMPILER_RT_HAS_WFRAME_LARGER_THAN_FLAG) +check_cxx_compiler_flag("-Werror;-Wglobal-constructors" COMPILER_RT_HAS_WGLOBAL_CONSTRUCTORS_FLAG) +check_cxx_compiler_flag("-Werror;-Wc99-extensions" COMPILER_RT_HAS_WC99_EXTENSIONS_FLAG) +check_cxx_compiler_flag("-Werror;-Wgnu" COMPILER_RT_HAS_WGNU_FLAG) +check_cxx_compiler_flag("-Werror;-Wgnu-anonymous-struct" COMPILER_RT_HAS_WGNU_ANONYMOUS_STRUCT_FLAG) +check_cxx_compiler_flag("-Werror;-Wvariadic-macros" COMPILER_RT_HAS_WVARIADIC_MACROS_FLAG) +check_cxx_compiler_flag("-Werror;-Wunused-parameter" COMPILER_RT_HAS_WUNUSED_PARAMETER_FLAG) +check_cxx_compiler_flag("-Werror;-Wcovered-switch-default" 
COMPILER_RT_HAS_WCOVERED_SWITCH_DEFAULT_FLAG) +check_cxx_compiler_flag("-Werror;-Wsuggest-override" COMPILER_RT_HAS_WSUGGEST_OVERRIDE_FLAG) +check_cxx_compiler_flag("-Werror;-Wthread-safety" COMPILER_RT_HAS_WTHREAD_SAFETY_FLAG) +check_cxx_compiler_flag("-Werror;-Wthread-safety-reference" COMPILER_RT_HAS_WTHREAD_SAFETY_REFERENCE_FLAG) +check_cxx_compiler_flag("-Werror;-Wthread-safety-beta" COMPILER_RT_HAS_WTHREAD_SAFETY_BETA_FLAG) check_cxx_compiler_flag(-Wno-pedantic COMPILER_RT_HAS_WNO_PEDANTIC) check_cxx_compiler_flag(-Wno-format COMPILER_RT_HAS_WNO_FORMAT) check_cxx_compiler_flag(-Wno-format-pedantic COMPILER_RT_HAS_WNO_FORMAT_PEDANTIC) if(MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") - check_cxx_compiler_flag("/experimental:external /external:W0" COMPILER_RT_HAS_EXTERNAL_FLAG) + check_cxx_compiler_flag("/experimental:external;/external:W0" COMPILER_RT_HAS_EXTERNAL_FLAG) else() set(COMPILER_RT_HAS_EXTERNAL_FLAG FALSE) endif() @@ -166,21 +169,21 @@ check_cxx_compiler_flag(/wd4391 COMPILER_RT_HAS_WD4391_FLAG) check_cxx_compiler_flag(/wd4722 COMPILER_RT_HAS_WD4722_FLAG) check_cxx_compiler_flag(/wd4800 COMPILER_RT_HAS_WD4800_FLAG) -check_cxx_compiler_flag("-Werror -Warray-bounds" COMPILER_RT_HAS_ARRAY_BOUNDS_FLAG) -check_cxx_compiler_flag("-Werror -Wuninitialized" COMPILER_RT_HAS_UNINITIALIZED_FLAG) -check_cxx_compiler_flag("-Werror -Wshadow" COMPILER_RT_HAS_SHADOW_FLAG) -check_cxx_compiler_flag("-Werror -Wempty-body" COMPILER_RT_HAS_EMPTY_BODY_FLAG) -check_cxx_compiler_flag("-Werror -Wsizeof-pointer-memaccess" COMPILER_RT_HAS_SIZEOF_POINTER_MEMACCESS_FLAG) -check_cxx_compiler_flag("-Werror -Wsizeof-array-argument" COMPILER_RT_HAS_SIZEOF_ARRAY_ARGUMENT_FLAG) -check_cxx_compiler_flag("-Werror -Wsuspicious-memaccess" COMPILER_RT_HAS_SUSPICIOUS_MEMACCESS_FLAG) -check_cxx_compiler_flag("-Werror -Wbuiltin-memcpy-chk-size" COMPILER_RT_HAS_BUILTIN_MEMCPY_CHK_SIZE_FLAG) -check_cxx_compiler_flag("-Werror -Warray-bounds-pointer-arithmetic" 
COMPILER_RT_HAS_ARRAY_BOUNDS_POINTER_ARITHMETIC_FLAG) -check_cxx_compiler_flag("-Werror -Wreturn-stack-address" COMPILER_RT_HAS_RETURN_STACK_ADDRESS_FLAG) -check_cxx_compiler_flag("-Werror -Wsizeof-array-decay" COMPILER_RT_HAS_SIZEOF_ARRAY_DECAY_FLAG) -check_cxx_compiler_flag("-Werror -Wformat-insufficient-args" COMPILER_RT_HAS_FORMAT_INSUFFICIENT_ARGS_FLAG) -check_cxx_compiler_flag("-Werror -Wformat-security" COMPILER_RT_HAS_BUILTIN_FORMAL_SECURITY_FLAG) -check_cxx_compiler_flag("-Werror -Wsizeof-array-div" COMPILER_RT_HAS_SIZEOF_ARRAY_DIV_FLAG) -check_cxx_compiler_flag("-Werror -Wsizeof-pointer-div" COMPILER_RT_HAS_SIZEOF_POINTER_DIV_FLAG) +check_cxx_compiler_flag("-Werror;-Warray-bounds" COMPILER_RT_HAS_ARRAY_BOUNDS_FLAG) +check_cxx_compiler_flag("-Werror;-Wuninitialized" COMPILER_RT_HAS_UNINITIALIZED_FLAG) +check_cxx_compiler_flag("-Werror;-Wshadow" COMPILER_RT_HAS_SHADOW_FLAG) +check_cxx_compiler_flag("-Werror;-Wempty-body" COMPILER_RT_HAS_EMPTY_BODY_FLAG) +check_cxx_compiler_flag("-Werror;-Wsizeof-pointer-memaccess" COMPILER_RT_HAS_SIZEOF_POINTER_MEMACCESS_FLAG) +check_cxx_compiler_flag("-Werror;-Wsizeof-array-argument" COMPILER_RT_HAS_SIZEOF_ARRAY_ARGUMENT_FLAG) +check_cxx_compiler_flag("-Werror;-Wsuspicious-memaccess" COMPILER_RT_HAS_SUSPICIOUS_MEMACCESS_FLAG) +check_cxx_compiler_flag("-Werror;-Wbuiltin-memcpy-chk-size" COMPILER_RT_HAS_BUILTIN_MEMCPY_CHK_SIZE_FLAG) +check_cxx_compiler_flag("-Werror;-Warray-bounds-pointer-arithmetic" COMPILER_RT_HAS_ARRAY_BOUNDS_POINTER_ARITHMETIC_FLAG) +check_cxx_compiler_flag("-Werror;-Wreturn-stack-address" COMPILER_RT_HAS_RETURN_STACK_ADDRESS_FLAG) +check_cxx_compiler_flag("-Werror;-Wsizeof-array-decay" COMPILER_RT_HAS_SIZEOF_ARRAY_DECAY_FLAG) +check_cxx_compiler_flag("-Werror;-Wformat-insufficient-args" COMPILER_RT_HAS_FORMAT_INSUFFICIENT_ARGS_FLAG) +check_cxx_compiler_flag("-Werror;-Wformat-security" COMPILER_RT_HAS_BUILTIN_FORMAL_SECURITY_FLAG) +check_cxx_compiler_flag("-Werror;-Wsizeof-array-div" 
COMPILER_RT_HAS_SIZEOF_ARRAY_DIV_FLAG) +check_cxx_compiler_flag("-Werror;-Wsizeof-pointer-div" COMPILER_RT_HAS_SIZEOF_POINTER_DIV_FLAG) # Symbols. check_symbol_exists(__func__ "" COMPILER_RT_HAS_FUNC_SYMBOL) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 55fb7d6c3f101..d661fa8a558fd 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -469,6 +469,14 @@ if(COMPILER_RT_ARM_OPTIMIZED_FP AND BUILTIN_SUPPORTED_ARCH MATCHES "arm") arm/mulsf3.S arm/divsf3.S arm/adddf3.S + arm/muldf3.S + arm/divdf3.S + arm/cmpdf2.S + arm/cmpsf2.S + arm/gedf2.S + arm/gesf2.S + arm/unorddf2.S + arm/unordsf2.S ) set_source_files_properties(${assembly_files} PROPERTIES COMPILE_OPTIONS ${implicit_it_flag}) @@ -512,7 +520,6 @@ set(arm_sync_SOURCES set(thumb1_base_SOURCES arm/divsi3.S arm/udivsi3.S - arm/comparesf2.S arm/addsf3.S ${GENERIC_SOURCES} ) @@ -521,11 +528,28 @@ set_special_properties(arm/adddf3.S SUPERSEDES subdf3.c PROVIDES subdf3) if(COMPILER_RT_ARM_OPTIMIZED_FP) set(thumb1_base_SOURCES arm/thumb1/mulsf3.S + arm/thumb1/cmpdf2.S + arm/thumb1/cmpsf2.S + arm/thumb1/gedf2.S + arm/thumb1/gesf2.S + arm/thumb1/unorddf2.S + arm/thumb1/unordsf2.S arm/fnan2.c arm/fnorm2.c arm/funder.c ${thumb1_base_SOURCES} ) + set_special_properties(arm/thumb1/cmpdf2.S + SUPERSEDES comparedf2.c PROVIDES comparedf2) + set_special_properties(arm/thumb1/cmpsf2.S + SUPERSEDES comparesf2.c PROVIDES comparesf2) +else() + # Other Thumb1 assembly implementations which do not fall under the + # COMPILER_RT_ARM_OPTIMIZED_FP umbrella + set(thumb1_base_SOURCES + arm/comparesf2.S + ${thumb1_base_SOURCES} + ) endif() set(arm_EABI_RT_SOURCES diff --git a/compiler-rt/lib/builtins/arm/cmpdf2.S b/compiler-rt/lib/builtins/arm/cmpdf2.S new file mode 100644 index 0000000000000..fa6db64e8c1f7 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/cmpdf2.S @@ -0,0 +1,64 @@ +//===-- cmpdf2.S - double-precision floating point comparison 
-------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This function has the semantics of GNU __cmpdf2: it's a three-way compare +// which returns <0 if x0 if x>y. If the result is +// unordered (i.e. x or y or both is NaN) then it returns >0. +// +// This also makes it suitable for use as all of __eqdf2, __nedf2, __ltdf2 or +// __ledf2. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" +#include "crt_endian.h" + + .syntax unified + .text + .p2align 2 + + +op0h .req xh +op0l .req xl +op1h .req yh +op1l .req yl +.macro SetReturnRegister + mov r0, #0 + movhi r0, #1 + movlo r0, #-1 +.endm +.macro SetReturnRegisterNE + movne r0, #-1 + movhi r0, #1 +.endm + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__cmpdf2) + push {r4, lr} + VMOV_FROM_DOUBLE(r0, r1, d0) + VMOV_FROM_DOUBLE(r2, r3, d1) + bl __compiler_rt_softfp_cmpdf2 + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__cmpdf2, __compiler_rt_softfp_cmpdf2) +#endif +DEFINE_COMPILERRT_FUNCTION_ALIAS(__ledf2, __cmpdf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__ltdf2, __cmpdf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__eqdf2, __cmpdf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__nedf2, __cmpdf2) + +DEFINE_COMPILERRT_FUNCTION(__compiler_rt_softfp_cmpdf2) + #include "dcmp.h" + +LOCAL_LABEL(NaN): + mov r0, #+1 + bx lr + +END_COMPILERRT_FUNCTION(__compiler_rt_softfp_cmpdf2) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/cmpsf2.S b/compiler-rt/lib/builtins/arm/cmpsf2.S new file mode 100644 index 0000000000000..14166246101af --- /dev/null +++ b/compiler-rt/lib/builtins/arm/cmpsf2.S @@ -0,0 +1,56 @@ +//===-- cmpsf2.S - single-precision floating point comparison -------------===// +// +// Part of 
the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This function has the semantics of GNU __cmpsf2: it's a three-way compare +// which returns <0 if x0 if x>y. If the result is +// unordered (i.e. x or y or both is NaN) then it returns >0. +// +// This also makes it suitable for use as all of __eqsf2, __nesf2, __ltsf2 or +// __lesf2. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + .syntax unified + .text + .p2align 2 + +op0 .req r0 +op1 .req r1 +.macro SetReturnRegister + mov r0, #0 + movhi r0, #1 + movlo r0, #-1 +.endm + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__cmpsf2) + push {r4, lr} + vmov r0, s0 + vmov r1, s1 + bl __compiler_rt_softfp_cmpsf2 + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__cmpsf2, __compiler_rt_softfp_cmpsf2) +#endif +DEFINE_COMPILERRT_FUNCTION_ALIAS(__lesf2, __cmpsf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__ltsf2, __cmpsf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__eqsf2, __cmpsf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__nesf2, __cmpsf2) + +DEFINE_COMPILERRT_FUNCTION(__compiler_rt_softfp_cmpsf2) + #include "fcmp.h" + +LOCAL_LABEL(NaN): + mov r0, #+1 + bx lr + +END_COMPILERRT_FUNCTION(__compiler_rt_softfp_cmpsf2) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/dcmp.h b/compiler-rt/lib/builtins/arm/dcmp.h new file mode 100644 index 0000000000000..2dd7a36e857f7 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/dcmp.h @@ -0,0 +1,212 @@ +//===-- dcmp.h - shared code for double-precision FP comparison functions -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This code is the skeleton of a double-precision FP compare, with two details +// left out: which input value is in which register, and how to make the return +// value. It allows the main comparison logic to be shared between (for +// example) __ledf2 and __gedf2, varying only those details. +// +//===----------------------------------------------------------------------===// + +// How to use this header file: +// +// This header file is expected to be #included from inside a function +// definition in a .S file. The source file including this header should +// provide the following: +// +// op0h, op0l, op1h, op1l: register aliases (via .req) for the registers +// containing the input operands. +// - For most comparisons, op0h,op0l will correspond to xh,xl, and op1h,op1l +// to yh,yl (as defined in turn in crt_endian.h). +// - But a function with the reversed semantics of __aeabi_cdrcmple wil define +// them the other way round. +// +// SetReturnRegister: an assembly macro that looks at the PSR flags and sets up +// an appropriate return value in r0, for the cases that do *not* involve NaN. +// - On entry to this macro, the condition codes LO, EQ and HI indicate that +// op0 < op1, op0 == op1 or op0 > op1 respectively. +// - For functions that return a result in the flags, this macro can be empty, +// because those are the correct flags to return anyway. +// - Functions that return a boolean in r0 should set it up by checking the +// flags. +// +// SetReturnRegisterNE: a macro that does the same thing as SetReturnRegister, +// except that if the Z flag is set, it instead does nothing at all. 
(This +// macro must not assume that the flags were set by a single CMP: in +// particular, C=0 but Z=1 is possible on entry to this macro, so you must not +// use the LO condition code and assume it is mutually exclusive with EQ.) +// +// LOCAL_LABEL(NaN): a label defined within the compare function, after the +// #include of this header. Called when at least one input is a NaN, and sets +// up the appropriate return value for that case. + +// -------------------------------------------------- +// The actual entry point of the compare function. +// +// The basic plan is to start by ORing together the two inputs. This tells us +// two things: +// - the top bit of the output tells us whether both inputs are positive, or +// whether at least one is negative +// - if the 11 exponent bits of the output are not all 1, then there are +// definitely no NaNs, so a fast path can handle most non-NaN cases. + +// clang-format off + + // First diverge control for the negative-numbers case. + orrs r12, op0h, op1h + bmi LOCAL_LABEL(negative) // high bit set => at least one negative input + + // Here, both inputs are positive. Try adding 1<<20 to their bitwise OR in + // r12. This will carry all the way into the top bit, setting the N flag, if + // all 11 exponent bits were set. + cmn r12, #1 << 20 + bmi LOCAL_LABEL(NaNInf_check_positive) // need to look harder for NaNs + + // The fastest fast path: both inputs positive and we could easily tell there + // were no NaNs. So we just compare op0 and op1 as unsigned integers. + cmp op0h, op1h + SetReturnRegisterNE + bxne lr + cmp op0l, op1l + SetReturnRegister + bx lr + +LOCAL_LABEL(NaNInf_check_positive): + // Second tier for positive numbers. We come here if both inputs are + // positive, but our fast initial check didn't manage to rule out a NaN. But + // it's not guaranteed that there _is_ a NaN, for two reasons: + // + // 1. An input with exponent 0x7FF might be an infinity instead. Those + // behave normally under comparison. 
+ // + // 2. There might not even _be_ an input with exponent 0x7FF. All we know so + // far is that the two inputs ORed together had all the exponent bits + // set. So each of those bits is set in _at least one_ of the inputs, but + // not necessarily all in the _same_ input. + // + // Test each exponent individually for 0x7FF, using the same CMN idiom as + // above. If neither one carries into the sign bit then we have no NaNs _or_ + // infinities and can compare the registers and return again. + cmn op0h, #1 << 20 + cmnpl op1h, #1 << 20 + bmi LOCAL_LABEL(NaN_check_positive) + + // Second-tier return path, now we've ruled out anything difficult. By this + // time we know that the two operands have different exponents (because the + // exponents' bitwise OR is 0x7FF but neither one is 0x7FF by itself, so each + // must have a set bit not present in the other). So we only need to compare + // the high words. + cmp op0h, op1h + SetReturnRegister + bx lr + +LOCAL_LABEL(NaN_check_positive): + // Third tier for positive numbers. Here we know that at least one of the + // inputs has exponent 0x7FF. But they might still be infinities rather than + // NaNs. So now we must check whether there's an actual NaN. + // + // We do this by shifting the high word of each input left to get rid of the + // sign bit, shifting a bit in at the bottom which is 1 if any bit is set in + // the low word. Then we check if the result is _greater_ than 0xFFE00000 + // (but not equal), via adding 0x00200000 to it and testing for the HI + // condition (carry flag set, but Z clear). + // + // We could have skipped the second-tier check and done this more rigorous + // test immediately. But that would cost an extra instruction in the case + // where there are no infinities or NaNs, and we assume that that is so much + // more common that it's worth optimizing for. 
+ cmp op0l, #1 // set C if op0l is nonzero + adc op0h, op0h, op0h // shift op0h left, bringing in the C bit + cmp op1l, #1 // set C if op1l is nonzero + adc op1h, op1h, op1h // shift op1h left, bringing in the C bit + cmn op0h, #1 << 21 // if HI, then op0 is a NaN + cmnls op1h, #1 << 21 // if not HI, then do the same check for op1 + bhi LOCAL_LABEL(NaN) // now, if HI, there's definitely a NaN + + // Now we've finally ruled out NaNs! And we still know both inputs are + // positive. So the third-tier return path can just compare the top words + // again. (The fact that we've just shifted them left doesn't make a + // difference.) + cmp op0h, op1h + SetReturnRegister + bx lr + +LOCAL_LABEL(negative): + // We come here if at least one operand is negative. We haven't checked for + // NaNs at all yet (the sign check came first), so repeat the first-tier + // check strategy of seeing if all exponent bits are set in r12. + // + // On this path, the sign bit in r12 is set, so if adding 1 to the low + // exponent bit carries all the way through into the sign bit, it will + // _clear_ the sign bit rather than setting it. So we expect MI to be the + // "definitely no NaNs" result, where it was PL on the positive branch. + cmn r12, #1 << 20 + bpl LOCAL_LABEL(NaNInf_check_negative) + + // Now we have no NaNs, but at least one negative number. This gives us two + // complications: + // + // 1. Floating-point numbers are sign/magnitude, not two's complement, so we + // have to consider separately the cases of "both negative" and "one of + // each sign". + // + // 2. -0 and +0 are required to compare equal. + // + // But problem #1 is not as hard as it sounds! If both operands are negative, + // then we can get the result we want by comparing them as unsigned integers + // the opposite way round, because the input with the smaller value (as an + // integer) is the larger number in an FP ordering sense. 
And if one operand + // is negative and the other is positive, the _same_ reversed comparison + // works, because the positive number (with zero sign bit) will always + // compare less than the negative one in an unsigned-integers sense. + // + // So we only have to worry about problem #2, signed zeroes. This only + // affects the answer if _both_ operands are zero. So we check that by + // testing all bits of both operands apart from the sign bit. + orrs r12, op0l, op0h, LSL #1 // EQ if op0 is zero + orrseq r12, op1l, op1h, LSL #1 // now only EQ if both are zero + cmpne op1h, op0h // otherwise, compare them backwards + SetReturnRegisterNE + bxne lr + cmp op1l, op0l + SetReturnRegister + bx lr + +LOCAL_LABEL(NaNInf_check_negative): + // Second tier for negative numbers: we know the OR of the exponents is 0xFF, + // but again, we might not have either _actual_ exponent 0xFF, and also, an + // exponent 0xFF might be an infinity instead of a NaN. + // + // On this path we've already branched twice (once for negative numbers and + // once for the first-tier NaN check), so we'll just go straight to the + // precise check for NaNs. + // + // Like the NaNInf_check_positive case, we do each NaN check by making a + // word consisting of (high word << 1) OR (1 if low word is nonzero). But + // unlike the positive case, we can't make those words _in place_, + // overwriting op0h and op1h themselves, because that would shift the sign + // bits off the top, and we still need the sign bits to get the comparison + // right. (In the positive case, we knew both sign bits were 0, enabling a + // shortcut.) 
+ cmp op0l, #1 // set C if op0l is nonzero + adc r12, op0h, op0h // shift op0h left, bringing in the C bit + cmn r12, #1 << 21 // if HI, then op0 is a NaN + bhi LOCAL_LABEL(NaN) + cmp op1l, #1 // set C if op1l is nonzero + adc r12, op1h, op1h // shift op1h left, bringing in the C bit + cmn r12, #1 << 21 // if HI, then op1 is a NaN + bhi LOCAL_LABEL(NaN) + + // Now we've ruled out NaNs, so we can just compare the two input registers + // and return. On this path we _don't_ need to check for the special case of + // comparing two zeroes, because we only came here if the bitwise OR of the + // exponent fields was 0x7FF, which means the exponents can't both have been + // zero! So we can _just_ do the reversed CMP and finish. + cmp op1h, op0h + SetReturnRegister + bx lr diff --git a/compiler-rt/lib/builtins/arm/divdf3.S b/compiler-rt/lib/builtins/arm/divdf3.S new file mode 100644 index 0000000000000..58a9e2690efd3 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/divdf3.S @@ -0,0 +1,646 @@ +//===-- divdf3.S - double-precision floating point division ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the __divdf3 function (double precision floating point +// division), with the IEEE-754 default rounding (to nearest, ties to even), +// for the Arm and Thumb2 ISAs. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" +#include "crt_endian.h" + +// The basic strategy of this division code is to use Newton-Raphson iteration +// to calculate an approximation to 1/y, then multiply it by x. This procedure +// delivers a quotient with 10 extra bits of precision, but which isn't exact. 
+// We know an upper bound on its possible error, which gives an interval of +// possible values for the true quotient. So we can check the 10 extra bits to +// see whether a rounding boundary lies within the interval. If not, then we +// can round and return without worrying further; otherwise, we go to slower +// correction code that multiplies the approximate quotient back up by y and +// checks it against x. +// +// This strategy depends critically on the upper bound on the approximation +// error. Underestimating the error introduces a bug; overestimating it costs +// performance, by sending more cases than necessary to the slow path. +// +// To give high confidence of its correctness, the upper bound has been proved +// formally by Gappa. The Gappa proof and auxiliary code are not included in +// this version, but they can be found in the Arm Optimized Routines repository +// +// https://github.com/ARM-software/optimized-routines/blob/bf3e44c3784dd3e18d3d5232e13b4d81f232310b/fp/at32/ddiv.S +// https://github.com/ARM-software/optimized-routines/blob/bf3e44c3784dd3e18d3d5232e13b4d81f232310b/fp/auxiliary/ddiv-prove.py +// https://github.com/ARM-software/optimized-routines/blob/bf3e44c3784dd3e18d3d5232e13b4d81f232310b/fp/auxiliary/ddiv-diagnostics.c +// +// and a pair of blog posts describing the concepts and procedure are here: +// +// https://developer.arm.com/community/arm-community-blogs/b/embedded-and-microcontrollers-blog/posts/formally-verifying-a-floating-point-division-routine-with-gappa-p1 +// https://developer.arm.com/community/arm-community-blogs/b/embedded-and-microcontrollers-blog/posts/formally-verifying-a-floating-point-division-routine-with-gappa-p2 + + .syntax unified + .text + .p2align 2 + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__divdf3) + push {r4, lr} + VMOV_FROM_DOUBLE(r0, r1, d0) + VMOV_FROM_DOUBLE(r2, r3, d1) + bl __aeabi_ddiv + VMOV_TO_DOUBLE(d0, r0, r1) + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__divdf3, 
__aeabi_ddiv) +#endif + +DEFINE_COMPILERRT_FUNCTION(__aeabi_ddiv) + + push {r4,r5,r6,r7,r8,lr} + + // Check if either input exponent 7FF (infinity or NaN), and if so, branch + // out of line. + ldr r12, =0x07FF0000 // mask for exponent cold storage + bics r4, r12, xh, lsr #4 // test for Infs or NaNs + bicsne r4, r12, yh, lsr #4 + beq LOCAL_LABEL(ddiv_naninf) + + // Extract the exponents of the input values x and y into bits 16..26 of r14 + // and r5 respectively, and in the process, check if either exponent is zero + // (so that one or both inputs are 0 or denormal). In order to combine the + // two tests, the second ANDS is performed conditionally, so that if x's + // exponent is zero then the out-of-line code at ddiv_zerodenorm might find + // y's exponent hasn't been set up yet. + // + // We also calculate the sign of the result, which will be needed whether or + // not we branch. This is saved in the low bit of r4. + ands r4, r12, xh, lsr #4 // get exponent of x, setting Z if it's 0 + andsne r5, r12, yh, lsr #4 // if not, extract and test exponent of y + eor r6, xh, yh // XOR the input signs to get the result sign + orr r4, r4, r6, lsr #31 // save it in the low bit of r4 + beq LOCAL_LABEL(ddiv_zerodenorm) // branch out of line for zeroes or denormals + + // Calculate the initial exponent of the result, by subtracting the two input + // exponents and adjusting for the IEEE exponent bias. This value may have to + // be adjusted by 1 later, depending on the quotient of the mantissas. + // + // If we branched to ddiv_zerodenorm above, and it found denormals but no + // zeroes, it may branch back here after renormalising them. We expect the + // out-of-line code to have left the exponent difference in the top half of + // r4 (still with the output sign in the low bit), but not yet to have + // applied the bias. So it branches back in immediately after the SUB. 
+ // + // The exponent bias we want is either 0x3fe or 0x3ff, depending on whether + // we have to shift the output mantissa by 1 below. Neither of those values + // fits in the immediate field of an ADD instruction, so we must use two + // instructions. + sub r4, r4, r5 +LOCAL_LABEL(ddiv_normalised): // denormal handler will come back to here + add r4, r4, #0x03FC0000 // add the 8 high bits of the bias 0x3FE + add r4, r4, #0x00020000 // add the remaining bit of the bias + + // Shift both mantissas up to the top of their 64-bit register pair, and OR + // in the leading 1 bit, which will occupy the high bit of the high word in + // each case. + mov r5, #(1 << 31) // high bit for ORing in to both mantissas + orr xh, r5, xh, lsl #11 // shift up xh and OR in the high bit + orr yh, r5, yh, lsl #11 // same for yh + orr xh, xh, xl, lsr #21 // OR in the bits shifted out of xl into xh + orr yh, yh, yl, lsr #21 // same for yl and yh + lsl xl, xl, #11 // shift up the rest of xl + lsl yl, yl, #11 // same for yl + + // Check if the two mantissas are exactly equal, so that the quotient is + // exactly a power of 2. If so, branch out of line to handle that case + // specially. + // + // This guarantees that when we examine the approximate quotient afterwards, + // we can't be confused about whether it needs to be renormalised, which + // would otherwise cost just as much effort as this check. Our reciprocal + // approximation is always an underestimate (that's in the nature of this + // particular Newton-Raphson iteration), so if x < y (meaning the mantissas + // rather than the whole floats) then even the true quotient will be less + // than 1, and the approximation even more so. On the other hand, if x > y, + // then the true quotient will be enough greater than 1 that even the largest + // possible error in the approximation can't make it look like less than 1. + // + // (Proof: regard x,y as normalised to the range [1,2). 
If x > y, then we + // have x ≥ y+ε, where ε is the machine epsilon. So x/y ≥ 1+ε/y > 1+ε/2. And + // the bound on the approximation error, given below, is far less than ε/2.) + cmp xh, yh + cmpeq xl, yl + beq LOCAL_LABEL(ddiv_result_is_power_of_2) + + // Now we begin the actual calculation of the reciprocal approximation. + // + // We begin with our two input mantissas stored in xh:xl and yh:yl, each with + // its leading 1 explicit and shifted up to the top of the word. So they can + // be regarded as 64-bit integers with the high bit set and the bottom 11 + // bits clear. + + // Obtain an 8-bit reciprocal approximation by using the topmost 8 bits of y + // as a lookup table. The top bit of y is always set, so there are only 128 + // lookup table entries, not 256. The 8-bit value we load also has its top + // bit set. + lsr r5, yh, #24 // r5 is the table index plus 0x80 + + // Get the address of reciptbl, in various ways depending on position- + // independence and Arm/Thumb state. + // + // Since the table index calculated above in r5 includes the high mantissa + // bit, an index of 0x80 refers to the first table entry and 0xFF the last. + // So we subtract 0x80 from the table address to compensate. +#if defined __pic__ || defined __PIC__ || defined __ARM_ROPI + // In PIC or ROPI modes, we must construct the address in a pc-relative + // manner, by making a literal containing the offset from the current code. + // The reference point for that offset is the value of pc as read by the add + // instruction at get_reciptbl below, which will be 4 or 8 bytes after it in + // Thumb or Arm state respectively. +#if __thumb__ + ldr r6, =(LOCAL_LABEL(reciptbl)-0x80) - (LOCAL_LABEL(get_reciptbl)+4) +#else + ldr r6, =(LOCAL_LABEL(reciptbl)-0x80) - (LOCAL_LABEL(get_reciptbl)+8) +#endif +LOCAL_LABEL(get_reciptbl): + add r6, r6, pc +#else + // If we're not building for position independence, we can just load the + // target address directly. 
+ ldr r6, =(LOCAL_LABEL(reciptbl)-0x80) +#endif + + ldrb r6, [r6, r5] // and load the approximation into r6 + + // First Newton-Raphson iteration, which expands that 8-bit approximation to + // a 17-bit one, again with its top bit set. We use the top 16 bits of y for + // this, so that we can fit the multiplications into ordinary MUL rather than + // UMULL. + // + // The Newton-Raphson formula to turn an approximation r ≈ 1/y into a better + // one is r → r(2-yr). In this case we're scaling up to integers (informal + // fixed point), so the 2 becomes 2^24. + lsr r5, yh, #16 // get top halfword of y + mul r7, r6, r5 // multiply it by the input value r + rsb r7, r7, #(1 << 24) // subtract from 2 (scaled up appropriately) + mul r7, r6, r7 // multiply again to make r(2-yr) + lsr r7, r7, #14 // shift down to keep only 17 bits of it + + // Second iteration, expanding into a 32-bit reciprocal, using the top 31 + // bits of y (i.e. yh shifted by 1). The first multiplication (making yr) is + // 32x32 → 64 bits, so we use a single UMULL; the second one making r(2-yr) + // is 32x64, which we do with a UMULL by the bottom half of yr and then MLA + // by the top half, so we only keep the low 64 bits of the full answer. + // + // The subtraction from 2 (again scaled up, this time to 2^48) is done by + // RSBS+RSC, interleaved with the multiplications so as to use a delay slot + // on CPUs that have one. + lsr r12, yh, #1 + umull r6, r8, r7, r12 // r8:r6 = yr + rsbs r6, r6, #0 // low half of subtraction from 2 + umull r12, lr, r7, r6 // multiply r by the low half of 2-yr +#if __thumb__ + // Thumb has no RSC, so simulate it by bitwise inversion and then ADC + mvn r8, r8 + adc r8, r8, #(1 << 16) +#else + rsc r8, r8, #(1 << 16) // high half of subtraction from 2 +#endif + mla r6, r7, r8, lr // multiply r by the high half of 2-yr + + // Third iteration, expanding into a 64-bit reciprocal, with the leading bit + // expected to end up in bit 60. 
Now the first multiplication to make yr is + // 32x64 → 96 bits, so we put the product in three registers lr:r12:r8. + // However, we're going to discard the low word r8 completely, because it + // makes negligible difference. So we'll treat the output yr as 64-bit. + umull r8, r12, r6, yl // multiply r by bottom half of y + mov lr, #0 // initialize high word to 0 + umlal r12, lr, r6, yh // multiply r by top half of y + // Subtract from a power of 2, as usual. But in this case the power of 2 + // we're subtracting from is 2^64, which is just off the top of the 64-bit + // value in lr:r12. So in fact we're just negating the whole thing! + // + // To preserve the invariant that the approximation error is always negative, + // we negate via one's complement rather than two's. (This would only make a + // difference if r8 had happened to be exactly 0. That in turn can occur when + // yl=0, so one of the test cases in ddiv-diagnostics.c deliberately uses + // such a value, so that the intermediate results can be checked against the + // reference Python.) + mvn r12, r12 + mvn lr, lr + // Now lr:r12:r8 contains 2-yr. We discard the low word r8 to reduce that to + // 64 bits, and do another 32x64 → 96 bit multiplication. + umull r5, r8, r6, r12 // multiply r by bottom half of 2-yr + mov r7, #0 // initialize high word to 0 + umlal r8, r7, r6, lr // multiply r by top half of 2-yr + + // That's the Newton-Raphson iteration done: we have a 64-bit approximation + // to 1/y. Multiply it by x to get the full approximate quotient. + // + // In principle, this would be a 64x64 → 128 bit multiplication, involving + // four long multiply instructions. But we only need the top 64 bits, and + // we're already prepared to tolerate some error in the calculations, so we + // cut corners: don't multiply the two low words together at all, and we + // discard the bottom half of each of the (low * high) partial products + // without bothering to propagate carries out of it. 
+ // + // (All of these shortcuts are faithfully mimicked in the Python reference + // implementation which generates Gappa input, so they're all accounted for + // in the error analysis.) +#if __ARM_FEATURE_DSP + umull r12, r6, xh, r8 // r6 = high word of x * low word of 1/y + umull r12, r5, xl, r7 // r5 = low word of x * high word of 1/y + umaal r6, r5, xh, r7 // add those to the product of both high words +#else + // Alternative instruction sequence using UMLAL, if UMAAL isn't available + umull r12, r6, xh, r8 // r6 = high word of x * low word of 1/y + umull r12, lr, xl, r7 // lr = low word of x * high word of 1/y + adds r6, r6, lr // add those together + mov r5, #0 // set r5 to the carry out of that addition + adc r5, r5, #0 + umlal r6, r5, xh, r7 // add that to the product of both high words +#endif + // Now r5:r6 is the completed approximate quotient, with its leading bit at + // position either 61 or 62. + + // Normalize so that the leading bit is always in bit 60, by shifting left if + // it isn't there already, and adjusting the output exponent by 1 to + // compensate. + // + // We do the test in a slightly tricky way, by arranging to set the V flag if + // the leading bit is in bit 60. This allows us to do the left shift under + // the VC condition, which is convenient because the LSLS instruction that + // shifts the low word left moves the top bit into the C flag without + // affecting V. + // + // We also save the value written into lr by the initial ADDS instruction, + // because that contains enough information to tell us whether we + // renormalised here. The correction path for quotients too close to a + // rounding boundary will need to recover that information. 
+ adds lr, r5, #0x40000000 // set V flag if bit 62 of high word set + subvc r4, r4, #(1 << 16) // if not, correct the exponent by 1, + lslsvc r6, r6, #1 // shift the low word of the quotient left + adcvc r5, r5, r5 // and shift its top bit into the high word + + // Now r5:r6 is the _normalised_ approximate quotient, with its leading bit + // reliably in bit 60. This is the final output of the calculation that the + // Gappa error-analysis proof applies to. + + // That 64-bit output has bit 63 clear; the leading 1 bit of the output + // mantissa in bit 62, followed by 52 more mantissa bits; then 10 bits at the + // bottom which are used for determining rounding. + // + // Compute the _approximately_ rounded-to-nearest output mantissa, by adding + // half a ULP and shifting down. If we don't go to the slow path, this is the + // correct output mantissa. (See fdiv.S for the proof that the round-to-even + // tiebreaking case can't occur in floating-point division.) + // + // We keep the original version of r6, containing the ten rounding bits, so + // that we can test it to see if we need the slow path. + adds r7, r6, #(1 << 9) // add half a ULP, copying low word into r7 + adc r5, r5, #0 // propagate carry into high word + lsr r7, r7, #10 // shift low word right + orr r7, r7, r5, lsl #22 // combine with bits shifted out of high word + lsr r5, r5, #10 // shift high word right + + // Now test r6 to see whether this output mantissa can be relied on, or + // whether the approximation landed too close to a rounding boundary. + // + // The maximum possible error in the approximation, taking into account the + // initial error in each lookup table entry, the remaining mathematical error + // introduced by stopping after this many Newton-Raphson iterations, and + // every shortcut, right shift, truncation and discarding of a partial + // product in the algorithm above, is always negative, and less than 64 units + // in the last place of the 64-bit approximate quotient. 
That is, the true + // quotient lies somewhere between the 64-bit integer described as "final + // output of the calculation" above, and that plus 64. + // + // So if the bottom 10 bits of r6 have the value 2^9 or greater, we're safe, + // because the true value is _larger_ than the approximation, so if the + // approximation is already above the rounding boundary then so is the true + // value. And if those 10 bits are (2^9-64) or less then we're also safe, + // because even if the true value is greater by 63, it's still on the same + // side of the rounding boundary. + // + // We check the error by subtracting (2^9-64), so that the dangerous values + // of the bottom 10 bits are those in the range 0,...,63, i.e. precisely + // those with none of bits 6,7,8,9 set. + // + // We also combine this test with a check for underflow, because that also + // needs more careful handling (the mantissa must be re-rounded to a + // different bit position, which involves knowing whether it's exact). + // Underflow has happened if the exponent in the top half of r4 is negative + // (it's off by 1 so that the leading mantissa bit will increment it), so we + // test by an ASR#31 (copying the top bit of r4 into all of it) and negating. + // That way, the output value is zero on underflow, matching the flags from + // the other check. + sub r6, r6, #(1 << 9)-64 + tst r6, #0x3C0 // now EQ means we must go to the slow path + mvnsne r12, r4, asr #31 // also set EQ if underflow has happened + beq LOCAL_LABEL(ddiv_correction) // branch out of line to do the hard bit + + // If we do go to ddiv_correction, it branches back here after the correction + // code has finished. Either way, we expect that r5:r7 is the result + // mantissa, with the top bit set, already in the correct position in the + // word, and already rounded to nearest. +LOCAL_LABEL(ddiv_corrected): + // Recombine the output mantissa with the sign and exponent. 
+ add xh, r5, r4, lsl #31 // add sign bit to top word of mantissa + bic r12, r4, #1 // isolate exponent in top half of r4 + add xh, xh, r12, lsl #4 // add exponent to make the final high word + mov xl, r7 // move low word into the right register + + // If there's no overflow or underflow, we're done. + // + // We _identified_ underflow above when we went to the slow path, but having + // done that, the slow path came back here, so we must check for it again. + // (The only purpose of the detour was to obtain accurate information about + // whether the quotient is exact, or needed rounding.) + // + // The output exponent, offset downwards by 1, is in the top half of r4. If + // it's negative, there's an underflow; if it's too large, there's an + // overflow. We do an approximate test for both at once via an unsigned + // comparison against 0x7f0, using r12 (the register in which we already + // cleared the sign bit stored at the bottom). This identifies _most_ normal + // outputs as quickly as possible. + // + // 0x7f0 isn't the maximum possible known-safe exponent, but it's the largest + // one that fits in the immediate field of CMP. We deal with the remaining + // cases in the next few instructions. + cmp r12, #(0x7f0 << 16) + popls {r4,r5,r6,r7,r8,pc} + + // Now check the remaining cases more carefully. + // + // If r12 < 0 then we definitely have underflow. We detect overflow precisely + // by seeing if the _final_ output exponent (in the output register xh) is + // 0x7ff or more, by incrementing it and seeing if the sign is opposite from + // the intended output sign. + add lr, xh, #(1 << 20) // increment the output exponent field + teq lr, r4, lsl #31 // set N if the sign now doesn't match r4[0] + tstpl r12, r12 // otherwise, set N if underflow + poppl {r4,r5,r6,r7,r8,pc} // if neither, we've finished + + // If we still haven't returned, we really do have overflow or underflow, and + // the sign of r12 tells us which. 
+ tst r12, r12 + bmi LOCAL_LABEL(ddiv_underflow) + // For overflow, correct the sign by biasing the exponent downward, and go to + // code that constructs an infinite return value (shared with the + // division-by-zero handler). + sub xh, xh, #0x60000000 + pop {r4,r5,r6,r7,r8,lr} // ddiv_retinf expects no regs on the stack + b LOCAL_LABEL(ddiv_retinf) + +LOCAL_LABEL(ddiv_correction): + // The slow path, entered if the approximate quotient was too close to a + // rounding boundary to trust, and also if there's a chance of underflow (so + // that we can reliably determine the rounding direction, including whether + // the quotient was exact). + // + // Regarding the input mantissas x,y and our approximate quotient q as + // integers in [2^52,2^53), the quotient is an approximation to either + // x*2^52/y or x*2^53/y, depending on which of x,y was larger. We know that q + // is less than the true value of that quotient by at most a small fraction + // of a ULP. So the correct rounded quotient is either equal to q or to q+1, + // and we can decide which by multiplying back up by y: we want q - x*2^k/y + // to be in the range (-1/2,+1/2) (where k = 52 or 53), which is equivalent + // to asking if qy - x*2^k is in the range (-y/2,+y/2). + // + // That's a calculation we can do in integers using only addition and + // multiplication. And we know that if q itself doesn't have that property + // then q+1 will. + + // The mantissa of y is currently right at the top of the word, which means + // that if the result of our check is greater than it, it will overflow. So + // we must start by shifting y downward. We'll put it back at the bottom of + // the word, where it was in the input float. + lsr yl, yl, #11 // shift yl right + orr yl, yl, yh, lsl #21 // OR in the bits shifted out of yh + lsr yh, yh, #11 // shift yh right + + // Compute the integer qy-x. 
Because q is already very close to the right + // quotient, we expect this to be an integer at most twice the size of y, + // which easily fits in 64 bits. So we don't need to compute the full 128-bit + // product: the low 64 bits are enough. + umull r8, r6, r7, yl // 64-bit product of the low words + mla r6, r7, yh, r6 // + (high word of y) * (low word of q) + mla r6, r5, yl, r6 // + (high word of q) * (low word of y) + + // Now we must subtract either x << 53 or x << 52. This will only affect the + // high word of the product we've just computed. Also the mantissa of x is + // already shifted left by 11. So we shift xl left by either (52-32-11) or + // (53-32-11), i.e. by 9 or by 10, and subtract from the high word of the + // product. + // + // To decide which, we consult the value left in lr by the original test for + // renormalization, which added 0x40000000 to the high word of the initial + // approximate quotient 'quot'. If that had bit 62 set (so no renormalization + // needed) then the addition carried into the sign bit; otherwise it didn't. + // So lr is positive if and only if we need to shift xl left by an extra bit. + tst lr, lr // did we renormalize? + subpl r6, r6, xl, lsl #10 // if so, subtract x<<53 from q*y + submi r6, r6, xl, lsl #9 // if not, subtract x<<52 + + // Now r6:r8 contains the residual value r = qy - x*2^k as described above. + // If this is between -y/2 and +y/2 then q is already the correctly rounded + // quotient. Otherwise, the correct quotient is q+1, so the value in r6:r8 + // will be too small (incrementing q would add y to it). So we need to check + // whether r < -y/2, or equivalently whether 2r < -y (avoiding having to + // worry about what happens when we halve y if it's odd). + // + // As mentioned above, division can't give an exact halfway case, so we don't + // need to worry about the case r = y/2. 
+ adds r8, r8, r8 // multiply the residual by 2 + adc r6, r6, r6 + adds lr, r8, yl // add y to it, discarding the result + adcs lr, r6, yh + bpl LOCAL_LABEL(ddiv_corrected) // if the answer is positive, we're OK + + // If we didn't take that branch, then the approximate quotient is too small + // by 1, so we must increment it. But also, we adjust the residual in r6:r8 + // to match. That residual is unused by the main epilogue code, but we also + // came here for any underflowing value, and the underflow handler will need + // the exact residual to determine the rounding direction. + // + // (We could re-test whether underflow had happened and use that to skip the + // update of r6:r8, but the test would cost as much effort as it saved!) + adds r7, r7, #1 // increment the output quotient + adcs r5, r5, #0 + adds r8, r8, yl // repeat the addition of y to the residual, + adcs r6, r6, yh // this time keeping the result in r6:r8 + b LOCAL_LABEL(ddiv_corrected) // finally we can rejoin the main code + +LOCAL_LABEL(ddiv_result_is_power_of_2): + // The special-case handler for the two input mantissas being equal, so that + // the result is an exact power of two. We set up all the output registers to + // the way the main code would have done it, and jump straight to + // ddiv_corrected. This includes setting r6:r8 to the 'residual' value + // computed by the slow path, in case this power-of-2 output is also an + // underflow, which will depend on those registers. + mov r5, #0x00100000 // high word of quotient mantissa = 1<<20 + mov r7, #0 // low word of quotient mantissa = 0 + mov r6, #0 // high word of residual = 0 + mov r8, #0 // low word of residual = 0 + b LOCAL_LABEL(ddiv_corrected) + +LOCAL_LABEL(ddiv_underflow): + // We come here to handle underflow. The output double, constructed naïvely + // from the out-of-range exponent, is in xh:xl. 
We expect in this situation + // that we've _always_ come via either the ddiv_correction slow path or the + // ddiv_result_is_power_of_2 special case, both of which will have set up a + // residual value in r6:r8 equal to q*y - x*2^k (for appropriate k). This + // value is positive if the quotient is slightly above the true value (i.e. + // was rounded up), or negative if the quotient was rounded down. But we must + // also distinguish the third case of the residual being exactly zero. + add xh, xh, #0x60000000 // apply IEEE 754 exponent bias for __dunder + orrs r12, r6, r8 // set r12=0 and Z=1 if quotient was exact + movne r12, #1 // otherwise, set r12 = +1 + orrne r12, r12, r6, asr #31 // and change to -1 if residual is negative + pop {r4,r5,r6,r7,r8,lr} // pop all locally saved registers + b SYMBOL_NAME(__compiler_rt_dunder) // and tailcall __dunder to finish + +LOCAL_LABEL(ddiv_zerodenorm): + // We come here if either input had exponent 0, so there's at least one zero + // or denormal. However, we know there are no infinities or NaNs, because + // those were checked first and will have gone to ddiv_naninf below. + // + // First we must repeat the instruction which extracted the exponent of y + // into r5, this time unconditionally, in case the setup code didn't do it. + and r5, r12, yh, lsr #4 + + // If either or both input is actually zero, the answer is easy. + orrs lr, xl, xh, lsl #1 // is x zero? + beq LOCAL_LABEL(ddiv_xzero) + orrs lr, yl, yh, lsl #1 // is y zero? + beq LOCAL_LABEL(ddiv_divbyzero) + + // Otherwise, delegate to __dnorm2 to handle denormals, converting them into + // a normalised mantissa and an out-of-range exponent. __dnorm2 expects the + // exponents at the bottom of their words instead of half way up, so shift + // down first. 
+ lsr r4, r4, #16 + lsr r5, r5, #16 + push {r0, r1, r2, r3, r4, r5} // create a 'struct dnorm2' on the stack + mov r0, sp // pass it by address + bl SYMBOL_NAME(__compiler_rt_dnorm2) + pop {r0, r1, r2, r3, r4, r5} + + // Rejoin the main code, with the exponent difference in the top half of r4, + // and the output sign in the low bit of r4. (The original setup code did the + // latter, but we clobbered it while setting up for __dnorm2.) + subs r4, r4, r5 // exponent difference, at the bottom of r4 + lsls r4, r4, #16 // move it up to the right place + orr r4, r4, r6, lsr #31 // recover output sign from top bit of r6 + b LOCAL_LABEL(ddiv_normalised) // rejoin the main code + +LOCAL_LABEL(ddiv_xzero): + // We come here if x=0. We return 0 (of the right sign) if y is not 0, and + // the default quiet NaN if both inputs are zero. + orrs lr, yl, yh, lsl #1 // is y zero? + beq LOCAL_LABEL(ddiv_ivo_pop) // if so, pop registers and return a NaN + // We know xl=0 already, so we only need to reset xh to contain the right + // output sign. The setup code left that in the high bit of r6. + and xh, r6, #0x80000000 + pop {r4,r5,r6,r7,r8,pc} + +LOCAL_LABEL(ddiv_divbyzero): + // We come here if y=0, but x is not 0 (or we'd have gone to ddiv_xzero above + // instead). So we're dividing a nonzero number by zero, and must return + // infinity. + pop {r4,r5,r6,r7,r8,lr} + eor xh, xh, yh // combine signs to get result sign + b LOCAL_LABEL(ddiv_retinf) + +LOCAL_LABEL(ddiv_naninf): + // We come here knowing that at least one operand is either NaN or infinity. + // If there's a NaN, we can tailcall __dnan2 to do the right thing. Pop our + // stacked registers first: we won't need that much spare space any more, and + // it makes the tailcall easier if we've already done it. + pop {r4,r5,r6,r7,r8,lr} + + // A number is a NaN if its exponent is 0x7ff and at least one bit below that + // is set. 
The CMP + ADC pair here converts the two words xh:xl into a single + // word containing xh shifted up by one (throwing away the sign bit which + // makes no difference), with its low bit set if xl was nonzero. So if that + // is strictly greater than 0xffe00000, then x was a NaN. + cmp xl, #1 + adc r12, xh, xh + cmp r12, #0xFFE00000 + bhi SYMBOL_NAME(__compiler_rt_dnan2) + // Now check y in the same way. + cmp yl, #1 + adc r12, yh, yh + cmp r12, #0xFFE00000 + bhi SYMBOL_NAME(__compiler_rt_dnan2) + + // Now we know there are no NaNs. Therefore there's at least one infinity. If + // both operands are infinity then we have inf / inf = invalid operation and + // must return a NaN. We detect this by XORing the inputs' exponent fields: + // knowing one of them is 7FF, they XOR to zero iff the other one is too. + eors r12, xh, yh // XOR entire top words of the inputs + lsl r12, r12, #1 // shift left to discard the sign bit + lsrs r12, r12, #21 // shift right again to discard mantissas + beq LOCAL_LABEL(ddiv_ivo) // if what's left is 0, we have inf / inf + + // Otherwise, there's exactly one infinity, so our answers are easy, but + // depend on which operand it is: + // infinity / anything = infinity + // anything / infinity = 0 + // + // Determine if x is the infinity, by bitwise inverting the whole word and + // then shifting left and right to isolate its exponent bits. + mvn r12, xh, lsl #1 // invert x, shift left to discard sign + lsrs r12, r12, #21 // and shift right to discard mantissa + eor xh, xh, yh // calculate the output sign bit + beq LOCAL_LABEL(ddiv_retinf) // if x = inf, return infinity of that sign + mov xl, #0 // otherwise clear all bits of x + and xh, xh, #0x80000000 // other than the sign bit + bx lr // and return zero of the same sign +LOCAL_LABEL(ddiv_retinf): + // Construct and return an infinity in xh:xl, with whatever sign bit is + // already in the top bit of xh. 
+ mov xl, #0 // clear low word + mvn xh, xh, lsr #31 // shift xh[31] down to bit 0, inverted + mvn xh, xh, lsl #11 // uninvert, and put exponent 0x7ff below it + lsl xh, xh, #20 // shift back up to the top + bx lr + + // Code to construct and return the default quiet NaN, for the cases inf/inf + // and 0/0. We provide two entry labels, one for callers who still need to + // pop all the registers this function pushed, and one for callers who have + // done that already. +LOCAL_LABEL(ddiv_ivo_pop): + pop {r4,r5,r6,r7,r8,lr} +LOCAL_LABEL(ddiv_ivo): + movw xh, 0x7ff8 + lsls xh, xh, #16 + mov xl, #0 + bx lr + +END_COMPILERRT_FUNCTION(__aeabi_ddiv) + + // Table of approximate reciprocals. + .rodata +LOCAL_LABEL(reciptbl): + .byte 0xFF,0xFD,0xFB,0xF9,0xF7,0xF5,0xF4,0xF2 + .byte 0xF0,0xEE,0xED,0xEB,0xE9,0xE8,0xE6,0xE4 + .byte 0xE3,0xE1,0xE0,0xDE,0xDD,0xDB,0xDA,0xD8 + .byte 0xD7,0xD5,0xD4,0xD3,0xD1,0xD0,0xCF,0xCD + .byte 0xCC,0xCB,0xCA,0xC8,0xC7,0xC6,0xC5,0xC4 + .byte 0xC2,0xC1,0xC0,0xBF,0xBE,0xBD,0xBC,0xBB + .byte 0xBA,0xB9,0xB8,0xB7,0xB6,0xB5,0xB4,0xB3 + .byte 0xB2,0xB1,0xB0,0xAF,0xAE,0xAD,0xAC,0xAB + .byte 0xAA,0xA9,0xA8,0xA8,0xA7,0xA6,0xA5,0xA4 + .byte 0xA3,0xA3,0xA2,0xA1,0xA0,0x9F,0x9F,0x9E + .byte 0x9D,0x9C,0x9C,0x9B,0x9A,0x99,0x99,0x98 + .byte 0x97,0x97,0x96,0x95,0x95,0x94,0x93,0x93 + .byte 0x92,0x91,0x91,0x90,0x8F,0x8F,0x8E,0x8E + .byte 0x8D,0x8C,0x8C,0x8B,0x8B,0x8A,0x89,0x89 + .byte 0x88,0x88,0x87,0x87,0x86,0x85,0x85,0x84 + .byte 0x84,0x83,0x83,0x82,0x82,0x81,0x81,0x80 + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/fcmp.h b/compiler-rt/lib/builtins/arm/fcmp.h new file mode 100644 index 0000000000000..4860479f45158 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/fcmp.h @@ -0,0 +1,176 @@ +//===-- fcmp.h - shared code for single-precision FP comparison functions -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This code is the skeleton of a single-precision FP compare, with two details
+// left out: which input value is in which register, and how to make the return
+// value. It allows the main comparison logic to be shared between (for
+// example) __lesf2 and __gesf2, varying only those details.
+//
+//===----------------------------------------------------------------------===//
+
+// How to use this header file:
+//
+// This header file is expected to be #included from inside a function
+// definition in a .S file. The source file including this header should
+// provide the following:
+//
+// op0 and op1: register aliases (via .req) for the registers containing the
+// input operands.
+// - For most comparisons, op0 will correspond to r0 and op1 to r1.
+// - But a function with the reversed semantics of __aeabi_cfrcmple will define
+// them the other way round.
+//
+// SetReturnRegister: an assembly macro that looks at the PSR flags and sets up
+// an appropriate return value in r0, for the cases that do *not* involve NaN.
+// - On entry to this macro, the condition codes LO, EQ and HI indicate that
+// op0 < op1, op0 == op1 or op0 > op1 respectively.
+// - For functions that return a result in the flags, this macro can be empty,
+// because those are the correct flags to return anyway.
+// - Functions that return a boolean in r0 should set it up by checking the
+// flags.
+//
+// LOCAL_LABEL(NaN): a label defined within the compare function, after the
+// #include of this header. Called when at least one input is a NaN, and sets
+// up the appropriate return value for that case.
+
+// --------------------------------------------------
+// The actual entry point of the compare function.
+//
+// The basic plan is to start by ORing together the two inputs.
This tells us +// two things: +// - the top bit of the output tells us whether both inputs are positive, or +// whether at least one is negative +// - if the 8 exponent bits of the output are not all 1, then there are +// definitely no NaNs, so a fast path can handle most non-NaN cases. + +// clang-format off + + // First diverge control for the negative-numbers case. + orrs r12, op0, op1 + bmi LOCAL_LABEL(negative) // high bit set => at least one negative input + + // Here, both inputs are positive. Try adding 1<<23 to their bitwise OR in + // r12. This will carry all the way into the top bit, setting the N flag, if + // all 8 exponent bits were set. + cmn r12, #1 << 23 + bmi LOCAL_LABEL(NaNInf_check_positive) // need to look harder for NaNs + + // The fastest fast path: both inputs positive and we could easily tell there + // were no NaNs. So we just compare op0 and op1 as unsigned integers. + cmp op0, op1 + SetReturnRegister + bx lr + +LOCAL_LABEL(NaNInf_check_positive): + // Second tier for positive numbers. We come here if both inputs are + // positive, but our fast initial check didn't manage to rule out a NaN. But + // it's not guaranteed that there _is_ a NaN, for two reasons: + // + // 1. An input with exponent 0xFF might be an infinity instead. Those behave + // normally under comparison. + // + // 2. There might not even _be_ an input with exponent 0xFF. All we know so + // far is that the two inputs ORed together had all the exponent bits + // set. So each of those bits is set in _at least one_ of the inputs, but + // not necessarily all in the _same_ input. + // + // Test each exponent individually for 0xFF, using the same CMN idiom as + // above. If neither one carries into the sign bit then we have no NaNs _or_ + // infinities and can compare the registers and return again. + cmn op0, #1 << 23 + cmnpl op1, #1 << 23 + bmi LOCAL_LABEL(NaN_check_positive) + + // Second-tier return path, now we've ruled out anything difficult. 
+ cmp op0, op1 + SetReturnRegister + bx lr + +LOCAL_LABEL(NaN_check_positive): + // Third tier for positive numbers. Here we know that at least one of the + // inputs has exponent 0xFF. But they might still be infinities rather than + // NaNs. So now we must check whether there's an actual NaN, by shifting each + // input left to get rid of the sign bit, and seeing if the result is + // _greater_ than 0xFF000000 (but not equal). + // + // We could have skipped the second-tier check and done this more rigorous + // test immediately. But that would cost an extra instruction in the case + // where there are no infinities or NaNs, and we assume that that is so much + // more common that it's worth optimizing for. + mov r12, #0xFF << 24 + cmp r12, op0, LSL #1 // if LO, then r12 < (op0 << 1), so op0 is a NaN + cmphs r12, op1, LSL #1 // if not LO, then do the same check for op1 + blo LOCAL_LABEL(NaN) // now, if LO, there's definitely a NaN + + // Now we've finally ruled out NaNs! And we still know both inputs are + // positive. So the third-tier return path can just compare the numbers + // again. + cmp op0, op1 + SetReturnRegister + bx lr + +LOCAL_LABEL(negative): + // We come here if at least one operand is negative. We haven't checked for + // NaNs at all yet (the sign check came first), so repeat the first-tier + // check strategy of seeing if all exponent bits are set in r12. + // + // On this path, the sign bit in r12 is set, so if adding 1 to the low + // exponent bit carries all the way through into the sign bit, it will + // _clear_ the sign bit rather than setting it. So we expect MI to be the + // "definitely no NaNs" result, where it was PL on the positive branch. + cmn r12, #1 << 23 + bpl LOCAL_LABEL(NaNInf_check_negative) + + // Now we have no NaNs, but at least one negative number. This gives us two + // complications: + // + // 1. 
Floating-point numbers are sign/magnitude, not two's complement, so we + // have to consider separately the cases of "both negative" and "one of + // each sign". + // + // 2. -0 and +0 are required to compare equal. + // + // But problem #1 is not as hard as it sounds! If both operands are negative, + // then we can get the result we want by comparing them as unsigned integers + // the opposite way round, because the input with the smaller value (as an + // integer) is the larger number in an FP ordering sense. And if one operand + // is negative and the other is positive, the _same_ reversed comparison + // works, because the positive number (with zero sign bit) will always + // compare less than the negative one in an unsigned-integers sense. + // + // So we only have to worry about problem #2, signed zeroes. This only + // affects the answer if _both_ operands are zero. And we can check that + // easily, because it happens if and only if r12 = 0x80000000. (We know r12 + // has its sign bit set; if it has no other bits set, that's because both + // inputs were either 0x80000000 or 0x00000000.) + cmp r12, #0x80000000 // EQ if both inputs are zero + cmpne op1, op0 // otherwise, compare them backwards + SetReturnRegister + bx lr + +LOCAL_LABEL(NaNInf_check_negative): + // Second tier for negative numbers: we know the OR of the exponents is 0xFF, + // but again, we might not have either _actual_ exponent 0xFF, and also, an + // exponent 0xFF might be an infinity instead of a NaN. + // + // On this path we've already branched twice (once for negative numbers and + // once for the first-tier NaN check), so we'll just go straight to the + // precise check for NaNs. + mov r12, #0xFF << 24 + cmp r12, op0, LSL #1 // if LO, then r12 < (op0 << 1), so op0 is a NaN + cmphs r12, op1, LSL #1 // if not LO, then do the same check for op1 + blo LOCAL_LABEL(NaN) + + // Now we've ruled out NaNs, so we can just compare the two input registers + // and return. 
On this path we _don't_ need to check for the special case of
+// comparing two zeroes, because we only came here if the bitwise OR of the
+// exponent fields was 0xFF, which means the exponents can't both have been
+// zero! So we can _just_ do the reversed CMP and finish.
+ cmp op1, op0
+ SetReturnRegister
+ bx lr
diff --git a/compiler-rt/lib/builtins/arm/gedf2.S b/compiler-rt/lib/builtins/arm/gedf2.S
new file mode 100644
index 0000000000000..18d99a312b00d
--- /dev/null
+++ b/compiler-rt/lib/builtins/arm/gedf2.S
@@ -0,0 +1,61 @@
+//===-- gedf2.S - double-precision floating point comparison --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This function has the semantics of GNU __cmpdf2, except for its NaN
+// handling. It's a three-way compare which returns <0 if x<y, 0 if x==y, or
+// >0 if x>y. If the result is unordered (i.e. x or y or both is NaN) then it
+// returns <0, where __cmpdf2 would return >0.
+//
+// This also makes it suitable for use as __gtdf2 or __gedf2 (or __eqdf2 or
+// __nedf2).
+//
+//===----------------------------------------------------------------------===//

+#include "../assembly.h"
+#include "crt_endian.h"
+
+ .syntax unified
+ .text
+ .p2align 2
+
+op0h .req xh
+op0l .req xl
+op1h .req yh
+op1l .req yl
+.macro SetReturnRegister
+ mov r0, #0
+ movhi r0, #1
+ movlo r0, #-1
+.endm
+.macro SetReturnRegisterNE
+ movne r0, #-1
+ movhi r0, #1
+.endm
+
+#if __ARM_PCS_VFP
+DEFINE_COMPILERRT_FUNCTION(__gedf2)
+ push {r4, lr}
+ VMOV_FROM_DOUBLE(r0, r1, d0)
+ VMOV_FROM_DOUBLE(r2, r3, d1)
+ bl __compiler_rt_softfp_gedf2
+ pop {r4, pc}
+#else
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__gedf2, __compiler_rt_softfp_gedf2)
+#endif
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__gtdf2, __gedf2)
+
+DEFINE_COMPILERRT_FUNCTION(__compiler_rt_softfp_gedf2)
+ #include "dcmp.h"
+
+LOCAL_LABEL(NaN):
+ mov r0, #-1
+ bx lr
+
+END_COMPILERRT_FUNCTION(__compiler_rt_softfp_gedf2)
+
+NO_EXEC_STACK_DIRECTIVE
diff --git a/compiler-rt/lib/builtins/arm/gesf2.S b/compiler-rt/lib/builtins/arm/gesf2.S
new file mode 100644
index 0000000000000..c149eea589f05
--- /dev/null
+++ b/compiler-rt/lib/builtins/arm/gesf2.S
@@ -0,0 +1,54 @@
+//===-- gesf2.S - single-precision floating point comparison --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This function has the semantics of GNU __cmpsf2, except for its NaN
+// handling. It's a three-way compare which returns <0 if x<y, 0 if x==y, or
+// >0 if x>y. If the result is unordered (i.e. x or y or both is NaN) then it
+// returns <0, where __cmpsf2 would return >0.
+//
+// This also makes it suitable for use as __gtsf2 or __gesf2 (or __eqsf2 or
+// __nesf2).
+// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + .syntax unified + .text + .p2align 2 + +op0 .req r0 +op1 .req r1 +.macro SetReturnRegister + mov r0, #0 + movhi r0, #1 + movlo r0, #-1 +.endm + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__gesf2) + push {r4, lr} + vmov r0, s0 + vmov r1, s1 + bl __compiler_rt_softfp_gesf2 + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__gesf2, __compiler_rt_softfp_gesf2) +#endif +DEFINE_COMPILERRT_FUNCTION_ALIAS(__gtsf2, __gesf2) + +DEFINE_COMPILERRT_FUNCTION(__compiler_rt_softfp_gesf2) + #include "fcmp.h" + +LOCAL_LABEL(NaN): + mov r0, #-1 + bx lr + +END_COMPILERRT_FUNCTION(__compiler_rt_softfp_gesf2) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/muldf3.S b/compiler-rt/lib/builtins/arm/muldf3.S new file mode 100644 index 0000000000000..b73cd7580fbf2 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/muldf3.S @@ -0,0 +1,404 @@ +//===-- muldf3.S - double-precision floating point multiplication ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the __muldf3 function (double precision floating point +// multiplication), with the IEEE-754 default rounding (to nearest, ties to +// even), for the Arm and Thumb2 ISAs. 
+// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" +#include "crt_endian.h" + + .syntax unified + .text + .p2align 2 + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__muldf3) + push {r4, lr} + VMOV_FROM_DOUBLE(r0, r1, d0) + VMOV_FROM_DOUBLE(r2, r3, d1) + bl __aeabi_dmul + VMOV_TO_DOUBLE(d0, r0, r1) + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__muldf3, __aeabi_dmul) +#endif + +DEFINE_COMPILERRT_FUNCTION(__aeabi_dmul) + + push {r4,r5,r6,lr} + + // Check if either input exponent is 000 or 7FF (i.e. not a normalized + // number), and if so, branch out of line. If we don't branch out of line, + // then we've also extracted the exponents of the input values x and y into + // bits 16..26 of r14 and r5 respectively. But if we do, then that hasn't + // necessarily been done (because the second AND might have been skipped). + ldr r12, =0x07FF0000 + ands r14, r12, xh, lsr #4 // sets Z if exponent of x is 0 + andsne r5, r12, yh, lsr #4 // otherwise, sets Z if exponent of y is 0 + teqne r14, r12 // otherwise, sets Z if exponent of x is 7FF + teqne r5, r12 // otherwise, sets Z if exponent of y is 7FF + beq LOCAL_LABEL(uncommon) // branch out of line to handle inf/NaN/0/denorm + + // Calculate the sign of the result, and put it in an unused bit of r14. + eor r4, xh, yh // XOR the input signs to get the result sign + orr r14, r14, r4, lsr #31 // save it in the low bit of r14 + + // Clear the exponent and sign bits from the top word of each mantissa, and + // set the leading mantissa bit in each one, so that they're in the right + // form to be multiplied. + bic xh, xh, r12, lsl #5 // r12 = 0x07FF0000, so r12 << 5 = 0xFF800000 + bic yh, yh, r12, lsl #5 + orr xh, xh, #(1 << 20) + orr yh, yh, #(1 << 20) + + // Now we're ready to multiply mantissas. This is also the place we'll come + // back to after decoding denormal inputs. 
The denormal decoding will also + // have to set up the same register contents: + // - fractions in xh/xl and yh/yl, with leading bits at bit 20 of xh/yh + // - exponents in r14 and r5, starting at bit 16 + // - output sign in r14 bit 0 +LOCAL_LABEL(mul): + + // Multiply the two mantissas as if they were full 64-bit words, delivering a + // 128-bit output in four registers. We provide three different ways to do + // this, using different instructions. + // + // Interleaved with the multiplication code, we also compute the output + // exponent by adding the input exponents and rebiasing. This takes two + // instructions. We schedule each one after a multiplication, to use a delay + // slot from the multiplication on CPUs where there is one. + // + // We add r5 to r14, so that the output exponent is in the top half of r14, + // and r5 is freed up to be used in the multiplication. + // + // We rebias the exponent by subtracting 0x400, which is correct for one of + // the two places where the leading bit of the product could end up, and will + // need correcting by one in the other case. + // + // Exit conditions from the three-way #if: + // + // r4:r5:r6 are the top 96 bits of the 128-bit product, with the leading bit + // at either bit 8 or bit 9 of r4. The low bit of r6 is forced to 1 if any of + // the low 32 bits of the 128-bit product were set. + // + // The output sign is still in the low bit of r14; the top half contains the + // preliminary output exponent (yet to be adjusted depending on where the + // high bit of the product ended up). + +#if __ARM_FEATURE_DSP + // The UMAAL instruction, which computes a 64-bit product and adds two + // separate 32-bit values to it, makes this easy. 
+ umull r6, r4, xh, yl + add r14, r14, r5 // add exponents, freeing up r5 + umull r12, r5, xl, yl + sub r14, r14, #0x4000000 // initial rebiasing of exponent + umaal r6, r5, xl, yh + umaal r5, r4, xh, yh +#elif ARM_FP_DMUL_USE_UMLAL + // The UMLAL instruction computes a 64-bit product and adds a 64-bit value to + // it. But it doesn't write to the carry flag, so you can't tell if the + // addition wrapped. Therefore you have to use it in a way that means the + // addition never wraps. Here we do three of the four multiplications (xl*yl, + // xl*yh, xh*yh) in a chain, using UMLAL for the top two, in each case with + // the 64-bit accumulator consisting of the top half of the previous + // multiplication, and a high word set to zero before the UMLAL instruction. + // + // On Cortex-M3, this is not a win over just using UMULL and doing the + // additions by hand, because UMLAL takes two cycles longer than UMULL, and + // it also costs a cycle to initialise each of the two high accumulator words + // to zero. If the high word of the addend were not zero then those two + // cycles would be doing something useful, but as it is, they're wasted time. + // + // CPUs later than Cortex-M3 - in particular, Cortex-M4 - will do both UMLAL + // and UMULL much faster, so that this code is a win over the plain UMULL + // code below. But those CPUs typically have UMAAL anyway and will use the + // even faster version of the code above. So this code is provided in case + // it's useful, but won't be enabled unless you manually #define + // ARM_FP_DMUL_USE_UMLAL. + umull r12, r6, xl, yl + add r14, r14, r5 // add exponents, freeing up r5 + movs r5, #0 + umlal r6, r5, xl, yh + movs r4, #0 + umlal r5, r4, xh, yh + sub r14, r14, #0x4000000 // initial rebiasing of exponent + umull xl, yh, xh, yl + adds r6, r6, xl + adcs r5, r5, yh + adc r4, r4, #0 +#else + // Simplest approach, using plain UMULL to compute each 64-bit product, and + // separate ADD and ADC instructions to do the additions. 
On Cortex-M3 this + // wins over the UMLAL approach: it's one instruction longer, but three + // cycles quicker, since each use of UMLAL in the above version costs 2 + // cycles. + umull r4, r12, xh, yl + add r14, r14, r5 // add exponents, freeing up r5 + umull r6, r5, xl, yh + sub r14, r14, #0x4000000 // initial rebiasing of exponent + adds r6, r6, r4 + adcs r5, r5, r12 // carry from here is used below + + umull r4, r12, xh, yh // r12:r4 is top part + adc yh, r12, #0 // get carry from above addition + umull r12, xh, xl, yl // xh:r12 is bottom part + + adds r6, r6, xh + adcs r5, r5, r4 + adcs r4, yh, #0 +#endif + + // Now the full 128-bit product of the two mantissas occupies the four + // registers r4,r5,r6,r12 (in order from MSW to LSW). Since each input + // mantissa was in the range [2^52,2^53), the product is in the range + // [2^104,2^106), which means that the lowest-order word r12 is a long way + // below the round bit, so that it can only affect cases so close to a + // rounding boundary that you need to know if it's nonzero to tell whether + // you're rounding to even. Start by freeing up that register, ensuring the + // low bit of r6 is set if anything in r12 was nonzero. + tst r12, r12 + orrne r6, r6, #1 + + // Now we can regard the result as a 96-bit value in r4,r5,r6, with its + // leading bit in either bit 8 or 9 of r4. To move that bit up to its final + // position in bit 20, we must shift the whole thing left by either 11 or 12 + // bits. Find out which. + tst r4, #0x200 // is bit 9 set? + bne LOCAL_LABEL(shift11) // if so, only shift by 11 bits + + // In this branch, we're shifting left by 12 bits. Put the shifted result + // back into the output registers xh,xl, and the bits lower than the bottom + // mantissa bit into r4. 
+ lsls xh, r4, #12 // shift each input reg left 12 + lsls xl, r5, #12 + lsls r4, r6, #12 + orr xh, xh, r5, lsr #20 // and the top two right by 32-12 + orr xl, xl, r6, lsr #20 + + b LOCAL_LABEL(shifted) + +LOCAL_LABEL(shift11): + // In this branch, we're shifting left by 11 bits instead of 12, and we must + // adjust the exponent by 1 to compensate. + lsls xh, r4, #11 // shift each input reg left 11 + lsls xl, r5, #11 + lsls r4, r6, #11 + orr xh, xh, r5, lsr #21 // and the top two right by 32-11 + orr xl, xl, r6, lsr #21 + add r14, r14, #0x10000 // adjust the exponent + +LOCAL_LABEL(shifted): + // We've reconverged after shifting the mantissa, so that now the leading 1 + // bit of the mantissa is in bit 20 of xh, and r4 contains the bits lower + // than the bottom of xl. + + // Recombine the sign and exponent into the high bits of xh. If the exponent + // is over- or underflowed, this may not give a valid FP result, but because + // everything is put on by addition, it will be right "mod 2^64" so that we + // can bias the exponent back into range for underflow handling and that will + // recover the right sign. + // + // r14 still has the output sign in its low bit. To extract just the exponent + // for adding to xh, we could use BIC to clear that bit, or shift the value + // right. We do the latter, which saves a copy of the pre-rounding exponent + // in yl, to use later for overflow detection. The shift is ASR, so that if + // the exponent is negative due to underflow, it stays negative. + asr yl, r14, #16 // isolate the exponent + add xh, xh, yl, lsl #20 // shift it back up to add to xh + add xh, xh, r14, lsl #31 // then add the sign + + // If we have to handle an underflow, we'll need enough information to + // reconstruct the rounding direction. Our strategy is + // + // - save the LSW of the output before rounding: if that differs from the + // LSW after rounding then we rounded up + // - save the round word r4: if that is zero then we didn't round at all. 
+ // + // We're going to branch past the rounding code for a quicker exit in the + // case where we're exact. In that case we don't need to save the output LSW + // at all, because the zero round word will override whatever it would have + // been anyway. + movs r6, r4 // unconditionally save round word + beq LOCAL_LABEL(rounded) // branch past rounding code if exact + mov r5, xl // and if not, save output LSW too + + // Rounding: we shift r4 left to put the round bit into the carry flag so + // that ADCS+ADC will conditionally increment the mantissa. But before we do + // the additions, we also check the Z flag, which tells us whether the + // remaining 31 bits are all zero. If so, we're either in the round-to-even + // (RTE) halfway case, or the exact case - but the exact case never came + // through this code at all, so it must be RTE. + // + // If those 31 bits _aren't_ all zero, we clear the top bit of r4, leaving it + // set only in the round-to-even case. Then (r4 >> 31) can be used to clear + // the low bit to perform RTE. + lsls r12, r4, #1 // test round word + bicne r4, r4, #0x80000000 // make top bit of r4 into the RTE bit + adcs xl, xl, #0 // conditionally increment the mantissa + adc xh, xh, #0 // ... and carry into its high word + bic xl, xl, r4, lsr #31 // round to even if r4[31] != 0 + +LOCAL_LABEL(rounded): + // Now we've rounded the output. The last thing we must do is check for + // overflow and underflow: if neither has happened, we can return. + // + // yl contains the pre-rounding output exponent minus 1 (so that the leading + // mantissa bit incremented it to the right output value). If this is in the + // range [0,0x7fd] then the leading bit would have incremented it to + // [1,0x7fe], which are non-overflowed output exponents. So an unsigned check + // if yl >= 0x7fe detects both overflow and underflow at once. + movw r12, #0x7FE + cmp yl, r12 + poplo {r4,r5,r6,pc} + + // We have either an underflow or an overflow. 
We can tell which it is by + // doing a _signed_ comparison of yl with the same value again - and since we + // only just did the CMP instruction, we can reuse the same flags. + bge LOCAL_LABEL(overflow) + + // Now we're dealing with an underflow. Set r2 to the rounding direction, by + // first checking xl against r5 (where we saved its pre-rounding value) to + // see if we rounded up or down, and then overriding that by checking r6 + // (where we saved the round word) to see if we didn't round at all. In the + // latter case the comparison against r5 will deliver nonsense, but then we + // overwrite it, so it doesn't matter. + cmp xl, r5 // did we modify the LSW, i.e. round up? + movne r2, #-1 // if so, the true value is a bit smaller + moveq r2, #+1 // else it's a bit bigger + cmp r6, #0 // except maybe we didn't round at all + moveq r2, #0 // in which case the true value is exact. + + // Add the IEEE 754 exponent bias, and tail-call __dunder to handle the rest + // of the job. + add xh, xh, #0x60000000 + pop {r4,r5,r6,lr} + b SYMBOL_NAME(__compiler_rt_dunder) + +LOCAL_LABEL(overflow): + // Here, we overflowed, so we must return an infinity of the correct sign. + // Rebias the exponent, which corrects the sign bit. + sub xh, xh, #0x60000000 + + // And pop our scratch registers before falling through into dmul_retinf. + pop {r4,r5,r6,lr} + +LOCAL_LABEL(retinf): + // This is entered from the overflow handler and also from cases with + // infinite inputs. It constructs an infinity, with sign bit equal to the + // high bit of xh. + // + // On entry to here, we expect not to have a stack frame any more, because + // one of our callers will have popped it already in order to conditionally + // tailcall __dnan2. 
+ mov xl, #0 // clear low word + mvn xh, xh, lsr #31 // shift xh[31] down to bit 0, inverted + mvn xh, xh, lsl #11 // uninvert, and put exponent 0x7ff below it + lsl xh, xh, #20 // shift back up to the top + bx lr + +LOCAL_LABEL(uncommon): + // We come here from the entry point, if any input had exponent 0 or 0x7ff. + // First we must repeat the instruction from the entry point that sets up r5 + // with the exponent of y, this time unconditionally, so we know we have both + // exponents in the top halves of r14 and r5. + and r5, r12, yh, lsr #4 + + // Check if either exponent is 0x7ff, by comparing against the value left in + // r12 by the entry point. If so, branch away to handle NaNs and infinities. + teq r14, r12 + teqne r5, r12 + beq LOCAL_LABEL(naninf) + + // If we didn't branch, we're dealing with finite numbers, including a zero + // or a denormal or both. + // + // First save the output sign. + eor r6, xh, yh + + // Handle zeroes first, because if there's a zero we don't have to worry + // about denormals at all. + orrs r4, xl, xh, lsl #1 // is x zero? + orrsne r4, yl, yh, lsl #1 // or is y zero? + beq LOCAL_LABEL(retzero) // Return zero if so + + // Otherwise, delegate to __dnorm2 to handle denormals, converting them into + // a normalised mantissa and an out-of-range exponent. __dnorm2 expects the + // exponents at the bottom of their words instead of half way up, so shift + // down first, and back up again afterwards. + // + // This call clobbers r12, because we didn't bother to save it on the stack. + // That's fine, because we don't need the constant in it any more. When we go + // back to dmul_mul, that will use it as a scratch register. 
+ lsr r4, r14, #16 + lsr r5, r5, #16 + push {r0, r1, r2, r3, r4, r5} // create a 'struct dnorm2' on the stack + mov r0, sp // pass it by address + bl SYMBOL_NAME(__compiler_rt_dnorm2) + pop {r0, r1, r2, r3, r4, r5} + lsl r14, r4, #16 + lsls r5, r5, #16 + + // Put the output sign at the bottom of r14, the same place the fast path + // would have left it. Then rejoin the fast path. + orr r14, r14, r6, lsr #31 + b LOCAL_LABEL(mul) + +LOCAL_LABEL(retzero): + // Return an exact zero, with sign bit from the high bit of r6. + mov xl, #0 // low word is 0 + ands xh, r6, #0x80000000 // high word is 0 except for the sign + pop {r4,r5,r6,pc} + +LOCAL_LABEL(naninf): + // We come here knowing that at least one operand is either NaN or infinity. + // If there's a NaN, we can tailcall __dnan2 to do the right thing. Pop our + // stacked registers first: we won't need that much spare space any more, and + // it makes the tailcall easier if we've already done it. + pop {r4,r5,r6,lr} + + // A number is a NaN if its exponent is 0x7ff and at least one bit below that + // is set. The CMP + ADC pair here converts the two words xh:xl into a single + // word containing xh shifted up by one (throwing away the sign bit which + // makes no difference), with its low bit set if xl was nonzero. So if that + // is strictly greater than 0xffe00000, then x was a NaN. + cmp xl, #1 + adc r12, xh, xh + cmp r12, #0xFFE00000 + bhi SYMBOL_NAME(__compiler_rt_dnan2) + // Now check y in the same way. + cmp yl, #1 + adc r12, yh, yh + cmp r12, #0xFFE00000 + bhi SYMBOL_NAME(__compiler_rt_dnan2) + + // Now we know there are no NaNs. Therefore there's at least one infinity. If + // either operand is zero then we have inf * 0 = invalid operation and must + // return a NaN. + orrs r12, xl, xh, lsl #1 // are all bits of x zero except the sign? 
+ beq LOCAL_LABEL(retnan) // if so, x == 0, so y == inf
+ orrs r12, yl, yh, lsl #1 // same check the other way round
+ beq LOCAL_LABEL(retnan)
+
+ // If we have an infinity and no NaN, then we just return an infinity of the
+ // correct sign.
+ eor xh, xh, yh
+ b LOCAL_LABEL(retinf)
+
+LOCAL_LABEL(retnan):
+ // Return the default NaN, in the case where the inputs were 0 and infinity.
+ movw xh, 0x7ff8
+ lsls xh, xh, #16
+ mov xl, #0
+ bx lr
+
+END_COMPILERRT_FUNCTION(__aeabi_dmul)
+
+NO_EXEC_STACK_DIRECTIVE
diff --git a/compiler-rt/lib/builtins/arm/thumb1/cmpdf2.S b/compiler-rt/lib/builtins/arm/thumb1/cmpdf2.S
new file mode 100644
index 0000000000000..3047a6f22e2ce
--- /dev/null
+++ b/compiler-rt/lib/builtins/arm/thumb1/cmpdf2.S
@@ -0,0 +1,61 @@
+//===-- cmpdf2.S - double-precision floating point comparison -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This function has the semantics of GNU __cmpdf2: it's a three-way compare
+// which returns <0 if x<y, 0 if x==y, or >0 if x>y. If the result is
+// unordered (i.e. x or y or both is NaN) then it returns >0.
+//
+// This also makes it suitable for use as all of __eqdf2, __nedf2, __ltdf2 or
+// __ledf2.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../../assembly.h"
+#include "../crt_endian.h"
+
+ .syntax unified
+ .text
+ .p2align 2
+
+op0h .req xh
+op0l .req xl
+op1h .req yh
+op1l .req yl
+.macro ReturnResult
+ bhi 0f
+ blo 1f
+ movs r0, #0
+ // This macro is always called immediately before returning from the
+ // function, so it's safe to use the same return instruction here, instead of
+ // wasting time branching forward to the end of the macro.
+ pop {r4,r5,r6,pc}
+0:
+ movs r0, #1
+ pop {r4,r5,r6,pc}
+1:
+ movs r0, #1
+ rsbs r0, r0, #0
+ pop {r4,r5,r6,pc}
+.endm
+
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__cmpdf2, __compiler_rt_softfp_cmpdf2)
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__ledf2, __cmpdf2)
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__ltdf2, __cmpdf2)
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__eqdf2, __cmpdf2)
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__nedf2, __cmpdf2)
+
+DEFINE_COMPILERRT_THUMB_FUNCTION(__compiler_rt_softfp_cmpdf2)
+ #include "dcmp.h"
+
+LOCAL_LABEL(NaN):
+ movs r0, #1
+ pop {r4,r5,r6,pc}
+
+END_COMPILERRT_FUNCTION(__compiler_rt_softfp_cmpdf2)
+
+NO_EXEC_STACK_DIRECTIVE
diff --git a/compiler-rt/lib/builtins/arm/thumb1/cmpsf2.S b/compiler-rt/lib/builtins/arm/thumb1/cmpsf2.S
new file mode 100644
index 0000000000000..e4a5e08c35181
--- /dev/null
+++ b/compiler-rt/lib/builtins/arm/thumb1/cmpsf2.S
@@ -0,0 +1,55 @@
+//===-- cmpsf2.S - single-precision floating point comparison -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This function has the semantics of GNU __cmpsf2: it's a three-way compare
+// which returns <0 if x<y, 0 if x==y, or >0 if x>y. If the result is
+// unordered (i.e. x or y or both is NaN) then it returns >0.
+//
+// This also makes it suitable for use as all of __eqsf2, __nesf2, __ltsf2 or
+// __lesf2.
+// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" + + .syntax unified + .text + .p2align 2 + +op0 .req r0 +op1 .req r1 +.macro ReturnResult + bhi 0f + blo 1f + movs r0, #0 + bx lr +0: + movs r0, #1 + bx lr +1: + movs r0, #1 + rsbs r0, r0, #0 + bx lr +.endm + +DEFINE_COMPILERRT_FUNCTION_ALIAS(__cmpsf2, __compiler_rt_softfp_cmpsf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__lesf2, __cmpsf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__ltsf2, __cmpsf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__eqsf2, __cmpsf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__nesf2, __cmpsf2) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__compiler_rt_softfp_cmpsf2) + #include "fcmp.h" + +LOCAL_LABEL(NaN): + movs r0, #1 + bx lr + +END_COMPILERRT_FUNCTION(__compiler_rt_softfp_cmpsf2) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/dcmp.h b/compiler-rt/lib/builtins/arm/thumb1/dcmp.h new file mode 100644 index 0000000000000..d0c1e2ddcb489 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/dcmp.h @@ -0,0 +1,231 @@ +//===-- dcmp.h - shared code for double-precision FP comparison functions -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This code is the skeleton of a double-precision FP compare, with two details +// left out: which input value is in which register, and how to make the return +// value. It allows the main comparison logic to be shared between (for +// example) __ledf2 and __gedf2, varying only those details. +// +//===----------------------------------------------------------------------===// + +// How to use this header file: +// +// This header file is expected to be #included from inside a function +// definition in a .S file. 
The source file including this header should +// provide the following: +// +// op0h, op0l, op1h, op1l: register aliases (via .req) for the registers +// containing the input operands. +// - For most comparisons, op0h,op0l will correspond to xh,xl, and op1h,op1l +// to yh,yl (as defined in turn in crt_endian.h). +// - But a function with the reversed semantics of __aeabi_cdrcmple will define +// them the other way round. +// +// ReturnResult: an assembly macro that looks at the PSR flags, sets up an +// appropriate return value in r0, and returns it, for the cases that do *not* +// involve NaN. +// - On entry to this macro, the condition codes LO, EQ and HI indicate that +// op0 < op1, op0 == op1 or op0 > op1 respectively. +// - For functions that return a result in the flags, this macro can just +// return immediately, because those are the correct flags to return anyway. +// - Functions that return a boolean in r0 should set it up by checking the +// flags. +// +// LOCAL_LABEL(NaN): a label defined within the compare function, after the +// #include of this header. Called when at least one input is a NaN, and sets +// up the appropriate return value for that case. + +// -------------------------------------------------- +// The actual entry point of the compare function. +// +// The basic plan is to start by ORing together the two inputs. This tells us +// two things: +// - the top bit of the output tells us whether both inputs are positive, or +// whether at least one is negative +// - if the 11 exponent bits of the output are not all 1, then there are +// definitely no NaNs, so a fast path can handle most non-NaN cases. + +// clang-format off + + push {r4,r5,r6,lr} + + // Set up the constant 1 << 20 in a register, which we'll need on all + // branches. + movs r5, #1 + lsls r5, r5, #20 + + // First diverge control for the negative-numbers case. 
+ movs r4, op0h + orrs r4, r4, op1h + bmi LOCAL_LABEL(negative) // high bit set => at least one negative input + + // Here, both inputs are positive. Try adding 1<<20 to their bitwise OR in + // r4. This will carry all the way into the top bit, setting the N flag, if + // all 11 exponent bits were set. + cmn r4, r5 + bmi LOCAL_LABEL(NaNInf_check_positive) // need to look harder for NaNs + + // The fastest fast path: both inputs positive and we could easily tell there + // were no NaNs. So we just compare op0 and op1 as unsigned integers. + cmp op0h, op1h + beq LOCAL_LABEL(low_word_positive) + ReturnResult +LOCAL_LABEL(low_word_positive): + cmp op0l, op1l + ReturnResult + +LOCAL_LABEL(NaNInf_check_positive): + // Second tier for positive numbers. We come here if both inputs are + // positive, but our fast initial check didn't manage to rule out a NaN. But + // it's not guaranteed that there _is_ a NaN, for two reasons: + // + // 1. An input with exponent 0x7FF might be an infinity instead. Those + // behave normally under comparison. + // + // 2. There might not even _be_ an input with exponent 0x7FF. All we know so + // far is that the two inputs ORed together had all the exponent bits + // set. So each of those bits is set in _at least one_ of the inputs, but + // not necessarily all in the _same_ input. + // + // Test each exponent individually for 0x7FF, using the same CMN idiom as + // above. If neither one carries into the sign bit then we have no NaNs _or_ + // infinities and can compare the registers and return again. + cmn op0h, r5 + bmi LOCAL_LABEL(NaN_check_positive) + cmn op1h, r5 + bmi LOCAL_LABEL(NaN_check_positive) + + // Second-tier return path, now we've ruled out anything difficult. By this + // time we know that the two operands have different exponents (because the + // exponents' bitwise OR is 0x7FF but neither one is 0x7FF by itself, so each + // must have a set bit not present in the other). So we only need to compare + // the high words. 
+ cmp op0h, op1h + ReturnResult + +LOCAL_LABEL(NaN_check_positive): + // Third tier for positive numbers. Here we know that at least one of the + // inputs has exponent 0x7FF. But they might still be infinities rather than + // NaNs. So now we must check whether there's an actual NaN. + // + // We do this by shifting the high word of each input left to get rid of the + // sign bit, shifting a bit in at the bottom which is 1 if any bit is set in + // the low word. Then we check if the result is _greater_ than 0xFFE00000 + // (but not equal), via adding 0x00200000 to it and testing for the HI + // condition (carry flag set, but Z clear). + // + // We could have skipped the second-tier check and done this more rigorous + // test immediately. But that would cost an extra instruction in the case + // where there are no infinities or NaNs, and we assume that that is so much + // more common that it's worth optimizing for. + lsls r6, r5, #1 // set r6 = 1<<21 + cmp op0l, #1 // set C if op0l is nonzero + adcs op0h, op0h, op0h // shift op0h left, bringing in the C bit + cmn op0h, r6 // if HI, then op0 is a NaN + bhi LOCAL_LABEL(NaN) + cmp op1l, #1 // set C if op1l is nonzero + adcs op1h, op1h, op1h // shift op1h left, bringing in the C bit + cmn op1h, r6 // if HI, then op1 is a NaN + bhi LOCAL_LABEL(NaN) + + // Now we've finally ruled out NaNs! And we still know both inputs are + // positive. So the third-tier return path can just compare the top words + // again. (The fact that we've just shifted them left doesn't make a + // difference.) + cmp op0h, op1h + ReturnResult + +LOCAL_LABEL(negative): + // We come here if at least one operand is negative. We haven't checked for + // NaNs at all yet (the sign check came first), so repeat the first-tier + // check strategy of seeing if all exponent bits are set in r4. 
+ // + // On this path, the sign bit in r4 is set, so if adding 1 to the low + // exponent bit carries all the way through into the sign bit, it will + // _clear_ the sign bit rather than setting it. So we expect MI to be the + // "definitely no NaNs" result, where it was PL on the positive branch. + cmn r4, r5 + bpl LOCAL_LABEL(NaNInf_check_negative) + + // Now we have no NaNs, but at least one negative number. This gives us two + // complications: + // + // 1. Floating-point numbers are sign/magnitude, not two's complement, so we + // have to consider separately the cases of "both negative" and "one of + // each sign". + // + // 2. -0 and +0 are required to compare equal. + // + // But problem #1 is not as hard as it sounds! If both operands are negative, + // then we can get the result we want by comparing them as unsigned integers + // the opposite way round, because the input with the smaller value (as an + // integer) is the larger number in an FP ordering sense. And if one operand + // is negative and the other is positive, the _same_ reversed comparison + // works, because the positive number (with zero sign bit) will always + // compare less than the negative one in an unsigned-integers sense. + // + // So we only have to worry about problem #2, signed zeroes. This only + // affects the answer if _both_ operands are zero. So we check that by + // testing all bits of both operands apart from the sign bit. + lsls r6, r4, #1 // logical OR of both high words except the signs + orrs r6, r6, op0l // combine that with the low word of op0 + orrs r6, r6, op1l // and op1, so now only EQ if both are zero + beq LOCAL_LABEL(equal) + // Now we've ruled out confusing zero cases, just compare the operands in + // reverse sense. + cmp op1h, op0h + beq LOCAL_LABEL(low_word_negative) + ReturnResult +LOCAL_LABEL(low_word_negative): + cmp op1l, op0l + ReturnResult + +LOCAL_LABEL(equal): + // We come here if we know the inputs are supposed to compare equal. 
Set up + // the flags by comparing a register with itself. + // + // (We might have come here via a BEQ, in which case we know Z=1, but we also + // need C=1 for our caller to get _all_ the right flags.) + cmp r0, r0 // compare a register with itself + ReturnResult + +LOCAL_LABEL(NaNInf_check_negative): + // Second tier for negative numbers: we know the OR of the exponents is 0x7FF, + // but again, we might not have either _actual_ exponent 0x7FF, and also, an + // exponent 0x7FF might be an infinity instead of a NaN. + // + // On this path we've already branched twice (once for negative numbers and + // once for the first-tier NaN check), so we'll just go straight to the + // precise check for NaNs. + // + // Like the NaNInf_check_positive case, we do each NaN check by making a + // word consisting of (high word << 1) OR (1 if low word is nonzero). But + // unlike the positive case, we can't make those words _in place_, + // overwriting op0h and op1h themselves, because that would shift the sign + // bits off the top, and we still need the sign bits to get the comparison + // right. (In the positive case, we knew both sign bits were 0, enabling a + // shortcut.) + lsls r6, r5, #1 // set r6 = 1<<21 + movs r4, op0h // copy op0h into a scratch register to modify + cmp op0l, #1 // set C if op0l is nonzero + adcs r4, r4, r4 // shift left, bringing in the C bit + cmn r4, r6 // if HI, then op0 is a NaN + bhi LOCAL_LABEL(NaN) + movs r4, op1h // copy op1h into a scratch register to modify + cmp op1l, #1 // set C if op1l is nonzero + adcs r4, r4, r4 // shift left, bringing in the C bit + cmn r4, r6 // if HI, then op1 is a NaN + bhi LOCAL_LABEL(NaN) + + // Now we've ruled out NaNs, so we can just compare the two input registers + // and return. On this path we _don't_ need to check for the special case of + // comparing two zeroes, because we only came here if the bitwise OR of the + // exponent fields was 0x7FF, which means the exponents can't both have been + // zero! 
So we can _just_ do the reversed CMP and finish. + cmp op1h, op0h + ReturnResult diff --git a/compiler-rt/lib/builtins/arm/thumb1/fcmp.h b/compiler-rt/lib/builtins/arm/thumb1/fcmp.h new file mode 100644 index 0000000000000..7d85abae05129 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/fcmp.h @@ -0,0 +1,189 @@ +//===-- fcmp.h - shared code for single-precision FP comparison functions -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This code is the skeleton of a single-precision FP compare, with two details +// left out: which input value is in which register, and how to make the return +// value. It allows the main comparison logic to be shared between (for +// example) __lesf2 and __gesf2, varying only those details. +// +//===----------------------------------------------------------------------===// + +// How to use this header file: +// +// This header file is expected to be #included from inside a function +// definition in a .S file. The source file including this header should +// provide the following: +// +// op0 and op1: register aliases (via .req) for the registers containing the +// input operands. +// - For most comparisons, op0 will correspond to r0 and op1 to r1. +// - But a function with the reversed semantics of __aeabi_cfrcmple will define +// them the other way round. +// +// ReturnResult: an assembly macro that looks at the PSR flags, sets up an +// appropriate return value in r0, and returns it, for the cases that do *not* +// involve NaN. +// - On entry to this macro, the condition codes LO, EQ and HI indicate that +// op0 < op1, op0 == op1 or op0 > op1 respectively. 
+// - For functions that return a result in the flags, this macro can just +// return immediately, because those are the correct flags to return anyway. +// - Functions that return a boolean in r0 should set it up by checking the +// flags. +// +// LOCAL_LABEL(NaN): a label defined within the compare function, after the +// #include of this header. Called when at least one input is a NaN, and sets +// up the appropriate return value for that case. + +// -------------------------------------------------- +// The actual entry point of the compare function. +// +// The basic plan is to start by ORing together the two inputs. This tells us +// two things: +// - the top bit of the output tells us whether both inputs are positive, or +// whether at least one is negative +// - if the 8 exponent bits of the output are not all 1, then there are +// definitely no NaNs, so a fast path can handle most non-NaN cases. + +// clang-format off + + // Set up the constant 1 << 23 in a register, which we'll need on all + // branches. + movs r3, #1 + lsls r3, r3, #23 + + // Diverge control for the negative-numbers case. + movs r2, op0 + orrs r2, r2, op1 + bmi LOCAL_LABEL(negative) // high bit set => at least one negative input + + // Here, both inputs are positive. Try adding 1<<23 to their bitwise OR in + // r2. This will carry all the way into the top bit, setting the N flag, if + // all 8 exponent bits were set. + cmn r2, r3 + bmi LOCAL_LABEL(NaNInf_check_positive) // need to look harder for NaNs + + // The fastest fast path: both inputs positive and we could easily tell there + // were no NaNs. So we just compare op0 and op1 as unsigned integers. + cmp op0, op1 + ReturnResult + +LOCAL_LABEL(NaNInf_check_positive): + // Second tier for positive numbers. We come here if both inputs are + // positive, but our fast initial check didn't manage to rule out a NaN. But + // it's not guaranteed that there _is_ a NaN, for two reasons: + // + // 1. 
An input with exponent 0xFF might be an infinity instead. Those behave + // normally under comparison. + // + // 2. There might not even _be_ an input with exponent 0xFF. All we know so + // far is that the two inputs ORed together had all the exponent bits + // set. So each of those bits is set in _at least one_ of the inputs, but + // not necessarily all in the _same_ input. + // + // Test each exponent individually for 0xFF, using the same CMN idiom as + // above. If neither one carries into the sign bit then we have no NaNs _or_ + // infinities and can compare the registers and return again. + cmn op0, r3 + bmi LOCAL_LABEL(NaN_check_positive) + cmn op1, r3 + bmi LOCAL_LABEL(NaN_check_positive) + + // Second-tier return path, now we've ruled out anything difficult. + cmp op0, op1 + ReturnResult + +LOCAL_LABEL(NaN_check_positive): + // Third tier for positive numbers. Here we know that at least one of the + // inputs has exponent 0xFF. But they might still be infinities rather than + // NaNs. So now we must check whether there's an actual NaN, by shifting each + // input left to get rid of the sign bit, and seeing if the result is + // _greater_ than 0xFF000000 (but not equal). + // + // We could have skipped the second-tier check and done this more rigorous + // test immediately. But that would cost an extra instruction in the case + // where there are no infinities or NaNs, and we assume that that is so much + // more common that it's worth optimizing for. + movs r2, #0xFF + lsls r2, r2, #24 + lsls r3, op0, #1 + cmp r3, r2 + bhi LOCAL_LABEL(NaN) + lsls r3, op1, #1 + cmp r3, r2 + bhi LOCAL_LABEL(NaN) + + // Now we've finally ruled out NaNs! And we still know both inputs are + // positive. So the third-tier return path can just compare the numbers + // again. + cmp op0, op1 + ReturnResult + +LOCAL_LABEL(negative): + // We come here if at least one operand is negative. 
We haven't checked for +// NaNs at all yet (the sign check came first), so repeat the first-tier + // check strategy of seeing if all exponent bits are set in r2. + // + // On this path, the sign bit in r2 is set, so if adding 1 to the low + // exponent bit carries all the way through into the sign bit, it will + // _clear_ the sign bit rather than setting it. So we expect MI to be the + // "definitely no NaNs" result, where it was PL on the positive branch. + cmn r2, r3 + bpl LOCAL_LABEL(NaNInf_check_negative) + + // Now we have no NaNs, but at least one negative number. This gives us two + // complications: + // + // 1. Floating-point numbers are sign/magnitude, not two's complement, so we + // have to consider separately the cases of "both negative" and "one of + // each sign". + // + // 2. -0 and +0 are required to compare equal. + // + // But problem #1 is not as hard as it sounds! If both operands are negative, + // then we can get the result we want by comparing them as unsigned integers + // the opposite way round, because the input with the smaller value (as an + // integer) is the larger number in an FP ordering sense. And if one operand + // is negative and the other is positive, the _same_ reversed comparison + // works, because the positive number (with zero sign bit) will always + // compare less than the negative one in an unsigned-integers sense. + // + // So we only have to worry about problem #2, signed zeroes. This only + // affects the answer if _both_ operands are zero. And we can check that + // easily, because it happens if and only if r2 = 0x80000000. (We know r2 + // has its sign bit set; if it has no other bits set, that's because both + // inputs were either 0x80000000 or 0x00000000.) 
+ lsls r2, r2, #1 // EQ if both inputs are zero (also sets C) + beq 1f + cmp op1, op0 // otherwise, compare them backwards +1: + ReturnResult + +LOCAL_LABEL(NaNInf_check_negative): + // Second tier for negative numbers: we know the OR of the exponents is 0xFF, + // but again, we might not have either _actual_ exponent 0xFF, and also, an + // exponent 0xFF might be an infinity instead of a NaN. + // + // On this path we've already branched twice (once for negative numbers and + // once for the first-tier NaN check), so we'll just go straight to the + // precise check for NaNs. + movs r2, #0xFF + lsls r2, r2, #24 + lsls r3, op0, #1 + cmp r3, r2 + bhi LOCAL_LABEL(NaN) + lsls r3, op1, #1 + cmp r3, r2 + bhi LOCAL_LABEL(NaN) + + // Now we've ruled out NaNs, so we can just compare the two input registers + // and return. On this path we _don't_ need to check for the special case of + // comparing two zeroes, because we only came here if the bitwise OR of the + // exponent fields was 0xFF, which means the exponents can't both have been + // zero! So we can _just_ do the reversed CMP and finish. + cmp op1, op0 + ReturnResult diff --git a/compiler-rt/lib/builtins/arm/thumb1/gedf2.S b/compiler-rt/lib/builtins/arm/thumb1/gedf2.S new file mode 100644 index 0000000000000..3673f24b5a160 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/gedf2.S @@ -0,0 +1,60 @@ +//===-- gedf2.S - double-precision floating point comparison --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This function has the semantics of GNU __cmpdf2, except for its NaN +// handling. It's a three-way compare which returns <0 if x0 if x>y. If the result is unordered (i.e. 
x or y or both is NaN) then it +// returns <0, where __cmpdf2 would return >0. +// +// This also makes it suitable for use as __gtdf2 or __gedf2 (or __eqdf2 or +// __nedf2). +// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" +#include "../crt_endian.h" + + .syntax unified + .text + .p2align 2 + +op0h .req xh +op0l .req xl +op1h .req yh +op1l .req yl +.macro ReturnResult + bhi 0f + blo 1f + movs r0, #0 + // This macro is always called immediately before returning from the + // function, so it's safe to use the same return instruction here, instead of + // wasting time branching forward to the end of the macro. + pop {r4,r5,r6,pc} +0: + movs r0, #1 + pop {r4,r5,r6,pc} +1: + movs r0, #1 + rsbs r0, r0, #0 + pop {r4,r5,r6,pc} +.endm + +DEFINE_COMPILERRT_FUNCTION_ALIAS(__gedf2, __compiler_rt_softfp_gedf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__gtdf2, __gedf2) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__compiler_rt_softfp_gedf2) + #include "dcmp.h" + +LOCAL_LABEL(NaN): + movs r0, #1 + rsbs r0, r0, #0 + pop {r4,r5,r6,pc} + +END_COMPILERRT_FUNCTION(__compiler_rt_softfp_gedf2) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/gesf2.S b/compiler-rt/lib/builtins/arm/thumb1/gesf2.S new file mode 100644 index 0000000000000..3830b6cb21c29 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/gesf2.S @@ -0,0 +1,54 @@ +//===-- gesf2.S - single-precision floating point comparison --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This function has the semantics of GNU __cmpsf2, except for its NaN +// handling. It's a three-way compare which returns <0 if x0 if x>y. If the result is unordered (i.e. 
x or y or both is NaN) then it +// returns <0, where __cmpsf2 would return >0. +// +// This also makes it suitable for use as __gtsf2 or __gesf2 (or __eqsf2 or +// __nesf2). +// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" + + .syntax unified + .text + .p2align 2 + +op0 .req r0 +op1 .req r1 +.macro ReturnResult + bhi 0f + blo 1f + movs r0, #0 + bx lr +0: + movs r0, #1 + bx lr +1: + movs r0, #1 + rsbs r0, r0, #0 + bx lr +.endm + +DEFINE_COMPILERRT_FUNCTION_ALIAS(__gesf2, __compiler_rt_softfp_gesf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__gtsf2, __gesf2) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__compiler_rt_softfp_gesf2) + #include "fcmp.h" + +LOCAL_LABEL(NaN): + movs r0, #1 + rsbs r0, r0, #0 + bx lr + +END_COMPILERRT_FUNCTION(__compiler_rt_softfp_gesf2) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/unorddf2.S b/compiler-rt/lib/builtins/arm/thumb1/unorddf2.S new file mode 100644 index 0000000000000..d3f4a1d4f27e9 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/unorddf2.S @@ -0,0 +1,60 @@ +//===-- unorddf2.S - double-precision floating point comparison -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Return 1 if the result of comparing x with y is 'unordered', i.e. +// one of x and y is NaN. +// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" +#include "../crt_endian.h" + + .syntax unified + .text + .p2align 2 + +DEFINE_COMPILERRT_FUNCTION_ALIAS(__unorddf2, __aeabi_dcmpun) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__aeabi_dcmpun) + + // This function isn't based on the general-purpose code in dcmp.h, because + // it's more effort than needed. 
Here we just need to identify whether or not + // there's at least one NaN in the inputs. There's no need to vary that check + // based on the sign bit, so we might as well just do the NaN test as quickly + // as possible. + // + // We do this by shifting the high word of each input left to get rid of the + // sign bit, shifting a bit in at the bottom which is 1 if any bit is set in + // the low word. Then we check if the result is _greater_ than 0xFFE00000 + // (but not equal), via adding 0x00200000 to it and testing for the HI + // condition (carry flag set, but Z clear). + // + // Once we've done that transformation to the first input xh:xl, we + // free up xl to contain our constant 0x00200000, so there's no need + // to push any registers. + cmp xl, #1 // set C if xl is nonzero + adcs xh, xh, xh // shift xh left, bringing in the C bit + movs xl, #1 // now xl is free, make the test constant + lsls xl, xl, #21 // by shifting 1 left to make 0x00200000 + cmn xh, xl // HI if x is a NaN + bhi LOCAL_LABEL(NaN) + cmp yl, #1 // set C if yl is nonzero + adcs yh, yh, yh // shift yh left, bringing in the C bit + cmn yh, xl // HI if y is a NaN + bhi LOCAL_LABEL(NaN) + + movs r0, #0 + bx lr + +LOCAL_LABEL(NaN): + movs r0, #1 + bx lr + +END_COMPILERRT_FUNCTION(__aeabi_dcmpun) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/unordsf2.S b/compiler-rt/lib/builtins/arm/thumb1/unordsf2.S new file mode 100644 index 0000000000000..5d74e0fdfe159 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/unordsf2.S @@ -0,0 +1,49 @@ +//===-- unordsf2.S - single-precision floating point comparison -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Return 1 if the result of comparing x with y is 'unordered', i.e. +// one of x and y is NaN. +// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" + + .syntax unified + .text + .p2align 2 + +DEFINE_COMPILERRT_FUNCTION_ALIAS(__unordsf2, __aeabi_fcmpun) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__aeabi_fcmpun) + + // This function isn't based on the general-purpose code in fcmp.h, because + // it's more effort than needed. Here we just need to identify whether or not + // there's at least one NaN in the inputs. There's no need to vary that check + // based on the sign bit, so we might as well just do the NaN test as quickly + // as possible. + movs r2, #0xFF + lsls r2, r2, #24 + lsls r3, r0, #1 + cmp r3, r2 + bhi LOCAL_LABEL(NaN) + lsls r3, r1, #1 + cmp r3, r2 + bhi LOCAL_LABEL(NaN) + + // If HS, then we have no NaNs and return false. + movs r0, #0 + bx lr + + // Otherwise, we have at least one NaN, and return true. +LOCAL_LABEL(NaN): + movs r0, #1 + bx lr + +END_COMPILERRT_FUNCTION(__aeabi_fcmpun) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/unorddf2.S b/compiler-rt/lib/builtins/arm/unorddf2.S new file mode 100644 index 0000000000000..8816d4073b4f3 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/unorddf2.S @@ -0,0 +1,71 @@ +//===-- unorddf2.S - double-precision floating point comparison -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Return 1 if the result of comparing x with y is 'unordered', i.e. +// one of x and y is NaN. 
+// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" +#include "crt_endian.h" + + + .syntax unified + .text + .p2align 2 + + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__unorddf2) + push {r4, lr} + VMOV_FROM_DOUBLE(r0, r1, d0) + VMOV_FROM_DOUBLE(r2, r3, d1) + bl __aeabi_dcmpun + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__unorddf2, __aeabi_dcmpun) +#endif + +DEFINE_COMPILERRT_FUNCTION(__aeabi_dcmpun) + + // This function isn't based on the general-purpose code in dcmp.h, because + // it's more effort than needed. Here we just need to identify whether or not + // there's at least one NaN in the inputs. There's no need to vary that check + // based on the sign bit, so we might as well just do the NaN test as quickly + // as possible. + // + // We do this by shifting the high word of each input left to get rid of the + // sign bit, shifting a bit in at the bottom which is 1 if any bit is set in + // the low word. Then we check if the result is _greater_ than 0xFFE00000 + // (but not equal), via adding 0x00200000 to it and testing for the HI + // condition (carry flag set, but Z clear). + // + // Once we've done that transformation to the first input xh:xl, we + // free up xl to contain our constant 0x00200000, so there's no need + // to push any registers. + cmp xl, #1 // set C if xl is nonzero + adc xh, xh, xh // shift xh left, bringing in the C bit + cmp yl, #1 // set C if yl is nonzero + adc yh, yh, yh // shift yh left, bringing in the C bit + cmn xh, #1 << 21 // if HI, then x is a NaN + cmnls yh, #1 << 21 // if not HI, then do the same check for y + + // If LS, then we have no NaNs and return false. We do this as quickly as we + // can (not stopping to take two instructions setting up r0 for both + // possibilities), on the assumption that NaNs are rare and we want to + // optimize for the non-NaN path. + movls r0, #0 + bxls lr + + // Otherwise, we have at least one NaN, and return true. 
+ mov r0, #1 + bx lr + +END_COMPILERRT_FUNCTION(__aeabi_dcmpun) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/unordsf2.S b/compiler-rt/lib/builtins/arm/unordsf2.S new file mode 100644 index 0000000000000..1930996779888 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/unordsf2.S @@ -0,0 +1,56 @@ +//===-- unordsf2.S - single-precision floating point comparison -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Return 1 if the result of comparing x with y is 'unordered', i.e. +// one of x and y is NaN. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + + .syntax unified + .text + .p2align 2 + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__unordsf2) + push {r4, lr} + vmov r0, s0 + vmov r1, s1 + bl __aeabi_fcmpun + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__unordsf2, __aeabi_fcmpun) +#endif + +DEFINE_COMPILERRT_FUNCTION(__aeabi_fcmpun) + + // This function isn't based on the general-purpose code in fcmp.h, because + // it's more effort than needed. Here we just need to identify whether or not + // there's at least one NaN in the inputs. There's no need to vary that check + // based on the sign bit, so we might as well just do the NaN test as quickly + // as possible. + mov r12, #0xFF << 24 + cmp r12, r0, lsl #1 // if LO, then r12 < (r0 << 1), so r0 is a NaN + cmphs r12, r1, lsl #1 // if not LO, then do the same check for r1 + + // If HS, then we have no NaNs and return false. We do this as quickly as we + // can (not stopping to take two instructions setting up r0 for both + // possibilities), on the assumption that NaNs are rare and we want to + // optimize for the non-NaN path. 
+ movhs r0, #0 + bxhs lr + + // Otherwise, we have at least one NaN, and return true. + mov r0, #1 + bx lr + +END_COMPILERRT_FUNCTION(__aeabi_fcmpun) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/scudo/standalone/tests/flags_test.cpp b/compiler-rt/lib/scudo/standalone/tests/flags_test.cpp index 0f934b87c38cf..591611edc425a 100644 --- a/compiler-rt/lib/scudo/standalone/tests/flags_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/flags_test.cpp @@ -121,6 +121,19 @@ TEST(ScudoFlagsTest, AllocatorFlags) { EXPECT_EQ(2048, Flags.quarantine_max_chunk_size); } +TEST(ScudoFlagsTest, InitFlagsEnv) { + const char *OldValue = getenv("SCUDO_ALLOCATION_RING_BUFFER_SIZE"); + setenv("SCUDO_ALLOCATION_RING_BUFFER_SIZE", "123", 1); + scudo::initFlags(); + scudo::Flags *F = scudo::getFlags(); + EXPECT_EQ(123, F->allocation_ring_buffer_size); + if (OldValue) { + setenv("SCUDO_ALLOCATION_RING_BUFFER_SIZE", OldValue, 1); + } else { + unsetenv("SCUDO_ALLOCATION_RING_BUFFER_SIZE"); + } +} + #ifdef GWP_ASAN_HOOKS TEST(ScudoFlagsTest, GWPASanFlags) { scudo::FlagParser Parser; diff --git a/compiler-rt/test/builtins/Unit/comparedf2new_test.c b/compiler-rt/test/builtins/Unit/comparedf2new_test.c new file mode 100644 index 0000000000000..d337cb3db4b0f --- /dev/null +++ b/compiler-rt/test/builtins/Unit/comparedf2new_test.c @@ -0,0 +1,619 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// RUN: %clang_builtins %s %librt -o %t && %run %t +// REQUIRES: librt_has_comparedf2 + +#include "int_lib.h" +#include <inttypes.h> +#include <stdio.h> + +#include "fp_test.h" + +COMPILER_RT_ABI int __eqdf2(double, double); +COMPILER_RT_ABI int __nedf2(double, double); +COMPILER_RT_ABI int __gedf2(double, double); +COMPILER_RT_ABI int __gtdf2(double, double); +COMPILER_RT_ABI int __ledf2(double, double); +COMPILER_RT_ABI int __ltdf2(double, double); +COMPILER_RT_ABI int __cmpdf2(double, double); +COMPILER_RT_ABI int __unorddf2(double, double); + +enum Result { RESULT_LT, RESULT_GT, RESULT_EQ, RESULT_UN }; + +int expect(uint64_t a_rep, uint64_t b_rep, const char *name, int result, int ok, + const char *expected, int line) { + if (!ok) + printf("error at line %d: %s(%016" PRIx64 ", %016" PRIx64 + ") = %d, expected %s\n", + line, name, a_rep, b_rep, result, expected); + return !ok; +} + +int test__comparedf2(uint64_t a_rep, uint64_t b_rep, enum Result result, + int line) { + double a = fromRep64(a_rep), b = fromRep64(b_rep); + + int eq = __eqdf2(a, b); + int ne = __nedf2(a, b); + int ge = __gedf2(a, b); + int gt = __gtdf2(a, b); + int le = __ledf2(a, b); + int lt = __ltdf2(a, b); +#ifdef __ELF__ + // The generic builtins/comparedf2.c does not define this function + // for object formats other than ELF + int cmp = __cmpdf2(a, b); +#endif + int unord = __unorddf2(a, b); + + int ret = 0; + + switch (result) { + case RESULT_LT: + ret |= expect(a_rep, b_rep, "__eqdf2", eq, eq != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__nedf2", ne, ne != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__gedf2", ge, ge < 0, "< 0", line); + ret |= expect(a_rep, b_rep, "__gtdf2", gt, gt <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__ledf2", le, le <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__ltdf2", lt, lt < 0, "< 0", line); +#ifdef __ELF__ + ret |= expect(a_rep, b_rep, "__cmpdf2", cmp, cmp == -1, "== -1", line); +#endif + ret 
|= expect(a_rep, b_rep, "__unorddf2", unord, unord == 0, "== 0", line); + break; + case RESULT_GT: + ret |= expect(a_rep, b_rep, "__eqdf2", eq, eq != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__nedf2", ne, ne != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__gedf2", ge, ge >= 0, ">= 0", line); + ret |= expect(a_rep, b_rep, "__gtdf2", gt, gt > 0, "> 0", line); + ret |= expect(a_rep, b_rep, "__ledf2", le, le > 0, "> 0", line); + ret |= expect(a_rep, b_rep, "__ltdf2", lt, lt >= 0, ">= 0", line); +#ifdef __ELF__ + ret |= expect(a_rep, b_rep, "__cmpdf2", cmp, cmp == 1, "== 1", line); +#endif + ret |= expect(a_rep, b_rep, "__unorddf2", unord, unord == 0, "== 0", line); + break; + case RESULT_EQ: + ret |= expect(a_rep, b_rep, "__eqdf2", eq, eq == 0, "== 0", line); + ret |= expect(a_rep, b_rep, "__nedf2", ne, ne == 0, "== 0", line); + ret |= expect(a_rep, b_rep, "__gedf2", ge, ge >= 0, ">= 0", line); + ret |= expect(a_rep, b_rep, "__gtdf2", gt, gt <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__ledf2", le, le <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__ltdf2", lt, lt >= 0, ">= 0", line); +#ifdef __ELF__ + ret |= expect(a_rep, b_rep, "__cmpdf2", cmp, cmp == 0, "== 0", line); +#endif + ret |= expect(a_rep, b_rep, "__unorddf2", unord, unord == 0, "== 0", line); + break; + case RESULT_UN: + ret |= expect(a_rep, b_rep, "__eqdf2", eq, eq != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__nedf2", ne, ne != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__gedf2", ge, ge < 0, "< 0", line); + ret |= expect(a_rep, b_rep, "__gtdf2", gt, gt <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__ledf2", le, le > 0, "> 0", line); + ret |= expect(a_rep, b_rep, "__ltdf2", lt, lt >= 0, ">= 0", line); +#ifdef __ELF__ + ret |= expect(a_rep, b_rep, "__cmpdf2", cmp, cmp == 1, "== 1", line); +#endif + ret |= expect(a_rep, b_rep, "__unorddf2", unord, unord == 1, "== 1", line); + break; + } + + return ret; +} + +#define test__comparedf2(a, b, x) test__comparedf2(a, 
b, x, __LINE__) + +int main(void) { + int status = 0; + + status |= test__comparedf2(0x0000000000000000, 0x0000000000000001, RESULT_LT); + status |= test__comparedf2(0x0000000000000000, 0x000fffffffffffff, RESULT_LT); + status |= test__comparedf2(0x0000000000000000, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000000, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000000, 0x7ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000000, 0x7ff00000a5a42e09, RESULT_UN); + status |= test__comparedf2(0x0000000000000000, 0x7ffcd5b95f9b89ae, RESULT_UN); + status |= test__comparedf2(0x0000000000000000, 0x7ffcd5b95f9b89ae, RESULT_UN); + status |= test__comparedf2(0x0000000000000000, 0x8000000000000000, RESULT_EQ); + status |= test__comparedf2(0x0000000000000000, 0x8000000000000001, RESULT_GT); + status |= test__comparedf2(0x0000000000000000, 0x800fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0000000000000000, 0x8010000000000000, RESULT_GT); + status |= test__comparedf2(0x0000000000000000, 0xfff0000000000000, RESULT_GT); + status |= test__comparedf2(0x0000000000000000, 0xfff00000a5a42e09, RESULT_UN); + status |= test__comparedf2(0x0000000000000000, 0xfffcd5b95f9b89ae, RESULT_UN); + status |= test__comparedf2(0x0000000000000000, 0xfffcd5b95f9b89ae, RESULT_UN); + status |= test__comparedf2(0x0000000000000001, 0x0000000000000001, RESULT_EQ); + status |= test__comparedf2(0x0000000000000001, 0x3fefffffffffffff, RESULT_LT); + status |= test__comparedf2(0x0000000000000001, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000001, 0x3ffffffffffffffe, RESULT_LT); + status |= test__comparedf2(0x0000000000000001, 0x3fffffffffffffff, RESULT_LT); + status |= test__comparedf2(0x0000000000000001, 0x7fdfffffffffffff, RESULT_LT); + status |= test__comparedf2(0x0000000000000001, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000001, 0x7feffffffffffffe, RESULT_LT); 
+ status |= test__comparedf2(0x0000000000000001, 0x7fefffffffffffff, RESULT_LT); + status |= test__comparedf2(0x0000000000000001, 0x7ff00000887bcf03, RESULT_UN); + status |= test__comparedf2(0x0000000000000001, 0x7ff753b1887bcf03, RESULT_UN); + status |= test__comparedf2(0x0000000000000001, 0x7ffc3134b058fe20, RESULT_UN); + status |= test__comparedf2(0x0000000000000001, 0x8000000000000001, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xbfefffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xbffffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xbfffffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xffdfffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xffeffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xffefffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xfff00000887bcf03, RESULT_UN); + status |= test__comparedf2(0x0000000000000001, 0xfff753b1887bcf03, RESULT_UN); + status |= test__comparedf2(0x0000000000000001, 0xfffc3134b058fe20, RESULT_UN); + status |= test__comparedf2(0x0000000000000002, 0x0000000000000001, RESULT_GT); + status |= test__comparedf2(0x0000000000000003, 0x0000000000000002, RESULT_GT); + status |= test__comparedf2(0x0000000000000003, 0x4008000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000003, 0x4014000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000003, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000003, 0xc014000000000000, RESULT_GT); + status |= test__comparedf2(0x0000000000000003, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x0000000000000004, 0x0000000000000004, RESULT_EQ); + status |= test__comparedf2(0x000ffffffffffffc, 
0x800ffffffffffffc, RESULT_GT); + status |= test__comparedf2(0x000ffffffffffffd, 0x000ffffffffffffe, RESULT_LT); + status |= test__comparedf2(0x000fffffffffffff, 0x0000000000000000, RESULT_GT); + status |= test__comparedf2(0x000fffffffffffff, 0x000ffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x000fffffffffffff, 0x000fffffffffffff, RESULT_EQ); + status |= test__comparedf2(0x000fffffffffffff, 0x0010000000000000, RESULT_LT); + status |= test__comparedf2(0x000fffffffffffff, 0x7ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x000fffffffffffff, 0x7ff00000dfe15ee3, RESULT_UN); + status |= test__comparedf2(0x000fffffffffffff, 0x7ff6d1ebdfe15ee3, RESULT_UN); + status |= test__comparedf2(0x000fffffffffffff, 0x7ffed0664505a878, RESULT_UN); + status |= test__comparedf2(0x000fffffffffffff, 0x8000000000000000, RESULT_GT); + status |= test__comparedf2(0x000fffffffffffff, 0xfff0000000000000, RESULT_GT); + status |= test__comparedf2(0x000fffffffffffff, 0xfff00000dfe15ee3, RESULT_UN); + status |= test__comparedf2(0x000fffffffffffff, 0xfff6d1ebdfe15ee3, RESULT_UN); + status |= test__comparedf2(0x000fffffffffffff, 0xfffed0664505a878, RESULT_UN); + status |= test__comparedf2(0x0010000000000000, 0x0000000000000000, RESULT_GT); + status |= test__comparedf2(0x0010000000000000, 0x0010000000000000, RESULT_EQ); + status |= test__comparedf2(0x0010000000000000, 0x8010000000000000, RESULT_GT); + status |= test__comparedf2(0x0010000000000001, 0x0010000000000000, RESULT_GT); + status |= test__comparedf2(0x0010000000000001, 0x0010000000000002, RESULT_LT); + status |= test__comparedf2(0x001fffffffffffff, 0x0020000000000000, RESULT_LT); + status |= test__comparedf2(0x001fffffffffffff, 0x0020000000000002, RESULT_LT); + status |= test__comparedf2(0x001fffffffffffff, 0x0020000000000004, RESULT_LT); + status |= test__comparedf2(0x0020000000000000, 0x001fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0020000000000001, 0x0010000000000001, RESULT_GT); + status |= 
test__comparedf2(0x0020000000000001, 0x001fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0020000000000002, 0x0010000000000001, RESULT_GT); + status |= test__comparedf2(0x002fffffffffffff, 0x0030000000000000, RESULT_LT); + status |= test__comparedf2(0x0030000000000000, 0x002fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0030000000000001, 0x002fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0030000000000002, 0x0020000000000003, RESULT_GT); + status |= test__comparedf2(0x3fe0000000000000, 0x3fe0000000000000, RESULT_EQ); + status |= test__comparedf2(0x3fefffffffffffff, 0x0000000000000001, RESULT_GT); + status |= test__comparedf2(0x3fefffffffffffff, 0x8000000000000001, RESULT_GT); + status |= test__comparedf2(0x3ff0000000000000, 0x3ff0000000000000, RESULT_EQ); + status |= test__comparedf2(0x3ff0000000000000, 0x3ff0000000000003, RESULT_LT); + status |= test__comparedf2(0x3ff0000000000000, 0x4000000000000000, RESULT_LT); + status |= test__comparedf2(0x3ff0000000000000, 0x401c000000000000, RESULT_LT); + status |= test__comparedf2(0x3ff0000000000000, 0x7ff0000033022725, RESULT_UN); + status |= test__comparedf2(0x3ff0000000000000, 0x7ff4f5ad33022725, RESULT_UN); + status |= test__comparedf2(0x3ff0000000000000, 0x7ffd3870667efc9d, RESULT_UN); + status |= test__comparedf2(0x3ff0000000000000, 0x8000000000000000, RESULT_GT); + status |= test__comparedf2(0x3ff0000000000000, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x3ff0000000000000, 0xbff0000000000003, RESULT_GT); + status |= test__comparedf2(0x3ff0000000000000, 0xfff0000033022725, RESULT_UN); + status |= test__comparedf2(0x3ff0000000000000, 0xfff4f5ad33022725, RESULT_UN); + status |= test__comparedf2(0x3ff0000000000000, 0xfffd3870667efc9d, RESULT_UN); + status |= test__comparedf2(0x3ff0000000000001, 0x3ff0000000000000, RESULT_GT); + status |= test__comparedf2(0x3ff0000000000001, 0x3ff0000000000002, RESULT_LT); + status |= test__comparedf2(0x3ff0000000000001, 
0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x3ffffffffffffffc, 0x3ffffffffffffffd, RESULT_LT); + status |= test__comparedf2(0x3fffffffffffffff, 0x0000000000000001, RESULT_GT); + status |= test__comparedf2(0x3fffffffffffffff, 0x4000000000000000, RESULT_LT); + status |= test__comparedf2(0x4000000000000000, 0x3ff0000000000000, RESULT_GT); + status |= test__comparedf2(0x4000000000000000, 0x3fffffffffffffff, RESULT_GT); + status |= test__comparedf2(0x4000000000000000, 0x4000000000000000, RESULT_EQ); + status |= test__comparedf2(0x4000000000000000, 0x4000000000000001, RESULT_LT); + status |= test__comparedf2(0x4000000000000000, 0xc000000000000000, RESULT_GT); + status |= test__comparedf2(0x4000000000000000, 0xc000000000000001, RESULT_GT); + status |= test__comparedf2(0x4000000000000000, 0xc014000000000000, RESULT_GT); + status |= test__comparedf2(0x4000000000000001, 0x3ff0000000000001, RESULT_GT); + status |= test__comparedf2(0x4000000000000001, 0x4000000000000002, RESULT_LT); + status |= test__comparedf2(0x4000000000000001, 0xc000000000000002, RESULT_GT); + status |= test__comparedf2(0x4000000000000002, 0x3ff0000000000001, RESULT_GT); + status |= test__comparedf2(0x4000000000000002, 0x3ff0000000000003, RESULT_GT); + status |= test__comparedf2(0x4000000000000004, 0x4000000000000003, RESULT_GT); + status |= test__comparedf2(0x4008000000000000, 0x4008000000000000, RESULT_EQ); + status |= test__comparedf2(0x400fffffffffffff, 0x400ffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x400fffffffffffff, 0x4010000000000002, RESULT_LT); + status |= test__comparedf2(0x4010000000000001, 0x400fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x4014000000000000, 0x0000000000000000, RESULT_GT); + status |= test__comparedf2(0x4014000000000000, 0x8000000000000000, RESULT_GT); + status |= test__comparedf2(0x4014000000000000, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x4014000000000000, 0xc014000000000000, RESULT_GT); + status |= 
test__comparedf2(0x7fb0000000000001, 0x7fafffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7fcfffffffffffff, 0x7fcffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x7fcfffffffffffff, 0x7fd0000000000002, RESULT_LT); + status |= test__comparedf2(0x7fd0000000000000, 0x7fcfffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7fd0000000000000, 0x7fd0000000000000, RESULT_EQ); + status |= test__comparedf2(0x7fd0000000000000, 0x7fd0000000000001, RESULT_LT); + status |= test__comparedf2(0x7fd0000000000001, 0x7fd0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fd0000000000001, 0x7fe0000000000001, RESULT_LT); + status |= test__comparedf2(0x7fd0000000000001, 0xffd0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fd0000000000002, 0x7fc0000000000003, RESULT_GT); + status |= test__comparedf2(0x7fd0000000000004, 0x7fd0000000000003, RESULT_GT); + status |= test__comparedf2(0x7fdffffffffffffe, 0x7fdffffffffffffe, RESULT_EQ); + status |= test__comparedf2(0x7fdffffffffffffe, 0x7fdfffffffffffff, RESULT_LT); + status |= test__comparedf2(0x7fdffffffffffffe, 0xffdfffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7fdfffffffffffff, 0x3ff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fdfffffffffffff, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0x7fdfffffffffffff, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fdfffffffffffff, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fe0000000000000, 0x3ff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fe0000000000000, 0x7fe0000000000000, RESULT_EQ); + status |= test__comparedf2(0x7fe0000000000000, 0x7ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x7fe0000000000000, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fe0000000000000, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fe0000000000000, 0xfff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fe0000000000001, 
0x7fe0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fe0000000000001, 0x7fe0000000000002, RESULT_LT); + status |= test__comparedf2(0x7fe0000000000001, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fe0000000000002, 0x7fd0000000000001, RESULT_GT); + status |= test__comparedf2(0x7feffffffffffffe, 0x3ff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7feffffffffffffe, 0x7fefffffffffffff, RESULT_LT); + status |= test__comparedf2(0x7feffffffffffffe, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7feffffffffffffe, 0xffefffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7fefffffffffffff, 0x0000000000000001, RESULT_GT); + status |= test__comparedf2(0x7fefffffffffffff, 0x3ff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fefffffffffffff, 0x7fefffffffffffff, RESULT_EQ); + status |= test__comparedf2(0x7fefffffffffffff, 0x7ff00000c901461b, RESULT_UN); + status |= test__comparedf2(0x7fefffffffffffff, 0x7ff784a9c901461b, RESULT_UN); + status |= test__comparedf2(0x7fefffffffffffff, 0x7ffe2c1db2e4a313, RESULT_UN); + status |= test__comparedf2(0x7fefffffffffffff, 0x8000000000000001, RESULT_GT); + status |= test__comparedf2(0x7fefffffffffffff, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fefffffffffffff, 0xfff00000c901461b, RESULT_UN); + status |= test__comparedf2(0x7fefffffffffffff, 0xfff784a9c901461b, RESULT_UN); + status |= test__comparedf2(0x7fefffffffffffff, 0xfffe2c1db2e4a313, RESULT_UN); + status |= test__comparedf2(0x7ff0000000000000, 0x0000000000000000, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0x0000000000000001, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0x000fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0x7fe0000000000000, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0x7fefffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0x7ff0000000000000, RESULT_EQ); + status |= 
test__comparedf2(0x7ff0000000000000, 0x7ff0e6d059ac9171, RESULT_UN); + status |= test__comparedf2(0x7ff0000000000000, 0x7ffbda2fc9024ae6, RESULT_UN); + status |= test__comparedf2(0x7ff0000000000000, 0x8000000000000000, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0x8000000000000001, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0x800fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0xffefffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0xfff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7ff0000047e8b9a0, 0x0000000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff4017647e8b9a0, 0x0000000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff00000abfe5d29, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ff2a1cdabfe5d29, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ff000005155db76, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff645cb5155db76, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff0000070c46aa0, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff2068470c46aa0, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff00000b5aee637, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff72b19b5aee637, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff00000c08c2788, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff1e0c1c08c2788, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff00000ec581a54, 0x7ff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff00000ec581a54, 0x7ff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff571eaec581a54, 0x7ff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff571eaec581a54, 0x7ff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff000003a3a1f94, 
0x7ff00000229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff000003a3a1f94, 0x7ffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff6439e3a3a1f94, 0x7ff00000229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff6439e3a3a1f94, 0x7ffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff00000ec581a54, 0xfff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff00000ec581a54, 0xfff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff571eaec581a54, 0xfff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff571eaec581a54, 0xfff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff000003a3a1f94, 0xfff00000229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff000003a3a1f94, 0xfffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff6439e3a3a1f94, 0xfff00000229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff6439e3a3a1f94, 0xfffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff00000c31d528e, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff5fb72c31d528e, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff00000ac81d215, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ff4481aac81d215, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ff00000d12062fd, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff707f6d12062fd, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff000001c6481ef, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff66ee91c6481ef, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff00000985729a7, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff19cff985729a7, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff0000053ec80fe, 0xfff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff7dbc153ec80fe, 0xfff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff00000816fb493, 0x0000000000000000, RESULT_UN); + status |= 
test__comparedf2(0x7ff87f75816fb493, 0x0000000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff000000c2d7c33, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ff91ecb0c2d7c33, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ff00000a68bae40, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ffc0acda68bae40, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff000002fe14961, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ffcfa4e2fe14961, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff000005c206da1, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff800bb5c206da1, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff0000051887a34, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ffce11951887a34, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff000002b4c32a8, 0x7ff000001edb8786, RESULT_UN); + status |= test__comparedf2(0x7ff000002b4c32a8, 0x7ff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0x7ffbd6b52b4c32a8, 0x7ff000001edb8786, RESULT_UN); + status |= test__comparedf2(0x7ffbd6b52b4c32a8, 0x7ff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0x7ff00000bc88c2a9, 0x7ff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff00000bc88c2a9, 0x7ffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff8eaadbc88c2a9, 0x7ff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff8eaadbc88c2a9, 0x7ffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff000002b4c32a8, 0xfff000001edb8786, RESULT_UN); + status |= test__comparedf2(0x7ff000002b4c32a8, 0xfff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0x7ffbd6b52b4c32a8, 0xfff000001edb8786, RESULT_UN); + status |= test__comparedf2(0x7ffbd6b52b4c32a8, 0xfff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0x7ff00000bc88c2a9, 0xfff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff00000bc88c2a9, 
0xfffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff8eaadbc88c2a9, 0xfff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff8eaadbc88c2a9, 0xfffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff00000a47525ca, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0x7ffcb028a47525ca, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff0000097c1af12, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ffc541e97c1af12, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ff00000bb1c07a4, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff966b7bb1c07a4, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff000001d98f07c, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff9dbf61d98f07c, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff0000040e65504, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ffb2a7440e65504, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff00000d9dc7412, 0xfff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff8af62d9dc7412, 0xfff0000000000000, RESULT_UN); + status |= test__comparedf2(0x8000000000000000, 0x0000000000000000, RESULT_EQ); + status |= test__comparedf2(0x8000000000000000, 0x0000000000000001, RESULT_LT); + status |= test__comparedf2(0x8000000000000000, 0x000fffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8000000000000000, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0x8000000000000000, 0x7ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x8000000000000000, 0x7ff000005a0faea3, RESULT_UN); + status |= test__comparedf2(0x8000000000000000, 0x7ff225cc5a0faea3, RESULT_UN); + status |= test__comparedf2(0x8000000000000000, 0x7ffa0cc436ad9daa, RESULT_UN); + status |= test__comparedf2(0x8000000000000000, 0x8000000000000001, RESULT_GT); + status |= test__comparedf2(0x8000000000000000, 0x800fffffffffffff, RESULT_GT); + status |= 
test__comparedf2(0x8000000000000000, 0x8010000000000000, RESULT_GT); + status |= test__comparedf2(0x8000000000000000, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x8000000000000000, 0xfff0000000000000, RESULT_GT); + status |= test__comparedf2(0x8000000000000000, 0xfff000005a0faea3, RESULT_UN); + status |= test__comparedf2(0x8000000000000000, 0xfff225cc5a0faea3, RESULT_UN); + status |= test__comparedf2(0x8000000000000000, 0xfffa0cc436ad9daa, RESULT_UN); + status |= test__comparedf2(0x8000000000000001, 0x0000000000000001, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x3fefffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x3ffffffffffffffe, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x3fffffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x7fdfffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x7feffffffffffffe, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x7fefffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x7ff0000013fd5944, RESULT_UN); + status |= test__comparedf2(0x8000000000000001, 0x7ff4154313fd5944, RESULT_UN); + status |= test__comparedf2(0x8000000000000001, 0x7ffd397ba0f9b5e1, RESULT_UN); + status |= test__comparedf2(0x8000000000000001, 0x8000000000000001, RESULT_EQ); + status |= test__comparedf2(0x8000000000000001, 0xbfefffffffffffff, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 0xbffffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 0xbfffffffffffffff, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 0xffdfffffffffffff, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 
0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 0xffeffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 0xffefffffffffffff, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 0xfff0000013fd5944, RESULT_UN); + status |= test__comparedf2(0x8000000000000001, 0xfff4154313fd5944, RESULT_UN); + status |= test__comparedf2(0x8000000000000001, 0xfffd397ba0f9b5e1, RESULT_UN); + status |= test__comparedf2(0x8000000000000002, 0x8000000000000001, RESULT_LT); + status |= test__comparedf2(0x8000000000000003, 0x4008000000000000, RESULT_LT); + status |= test__comparedf2(0x8000000000000003, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0x8000000000000003, 0x8000000000000002, RESULT_LT); + status |= test__comparedf2(0x8000000000000003, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x8000000000000004, 0x8000000000000004, RESULT_EQ); + status |= test__comparedf2(0x800ffffffffffffd, 0x800ffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x800fffffffffffff, 0x0000000000000000, RESULT_LT); + status |= test__comparedf2(0x800fffffffffffff, 0x000fffffffffffff, RESULT_LT); + status |= test__comparedf2(0x800fffffffffffff, 0x7ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x800fffffffffffff, 0x7ff00000a2b85efa, RESULT_UN); + status |= test__comparedf2(0x800fffffffffffff, 0x7ff1d4fba2b85efa, RESULT_UN); + status |= test__comparedf2(0x800fffffffffffff, 0x7ffd08c114a37fe6, RESULT_UN); + status |= test__comparedf2(0x800fffffffffffff, 0x8000000000000000, RESULT_LT); + status |= test__comparedf2(0x800fffffffffffff, 0x800ffffffffffffe, RESULT_LT); + status |= test__comparedf2(0x800fffffffffffff, 0x800fffffffffffff, RESULT_EQ); + status |= test__comparedf2(0x800fffffffffffff, 0x8010000000000000, RESULT_GT); + status |= test__comparedf2(0x800fffffffffffff, 0xfff0000000000000, RESULT_GT); + status |= test__comparedf2(0x800fffffffffffff, 0xfff00000a2b85efa, RESULT_UN); + status |= 
test__comparedf2(0x800fffffffffffff, 0xfff1d4fba2b85efa, RESULT_UN); + status |= test__comparedf2(0x800fffffffffffff, 0xfffd08c114a37fe6, RESULT_UN); + status |= test__comparedf2(0x8010000000000000, 0x0000000000000000, RESULT_LT); + status |= test__comparedf2(0x8010000000000000, 0x0010000000000000, RESULT_LT); + status |= test__comparedf2(0x8010000000000001, 0x8010000000000000, RESULT_LT); + status |= test__comparedf2(0x8010000000000001, 0x8010000000000002, RESULT_GT); + status |= test__comparedf2(0x801fffffffffffff, 0x8020000000000000, RESULT_GT); + status |= test__comparedf2(0x801fffffffffffff, 0x8020000000000002, RESULT_GT); + status |= test__comparedf2(0x801fffffffffffff, 0x8020000000000004, RESULT_GT); + status |= test__comparedf2(0x8020000000000000, 0x801fffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8020000000000001, 0x8010000000000001, RESULT_LT); + status |= test__comparedf2(0x8020000000000001, 0x801fffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8020000000000002, 0x8010000000000001, RESULT_LT); + status |= test__comparedf2(0x802fffffffffffff, 0x8030000000000000, RESULT_GT); + status |= test__comparedf2(0x8030000000000000, 0x802fffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8030000000000001, 0x802fffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8030000000000002, 0x8020000000000003, RESULT_LT); + status |= test__comparedf2(0xbff0000000000000, 0x3ff0000000000003, RESULT_LT); + status |= test__comparedf2(0xbff0000000000000, 0x7ff000000d32ab76, RESULT_UN); + status |= test__comparedf2(0xbff0000000000000, 0x7ff3d46c0d32ab76, RESULT_UN); + status |= test__comparedf2(0xbff0000000000000, 0x7ffb51e7ffa1e86b, RESULT_UN); + status |= test__comparedf2(0xbff0000000000000, 0x8000000000000000, RESULT_LT); + status |= test__comparedf2(0xbff0000000000000, 0xbff0000000000003, RESULT_GT); + status |= test__comparedf2(0xbff0000000000000, 0xfff000000d32ab76, RESULT_UN); + status |= test__comparedf2(0xbff0000000000000, 
0xfff3d46c0d32ab76, RESULT_UN); + status |= test__comparedf2(0xbff0000000000000, 0xfffb51e7ffa1e86b, RESULT_UN); + status |= test__comparedf2(0xbff0000000000001, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0xbff0000000000001, 0xbff0000000000000, RESULT_LT); + status |= test__comparedf2(0xbff0000000000001, 0xbff0000000000002, RESULT_GT); + status |= test__comparedf2(0xbffffffffffffffc, 0xbffffffffffffffd, RESULT_GT); + status |= test__comparedf2(0xbfffffffffffffff, 0x0000000000000001, RESULT_LT); + status |= test__comparedf2(0xbfffffffffffffff, 0xc000000000000000, RESULT_GT); + status |= test__comparedf2(0xc000000000000000, 0x4000000000000001, RESULT_LT); + status |= test__comparedf2(0xc000000000000000, 0xbfffffffffffffff, RESULT_LT); + status |= test__comparedf2(0xc000000000000000, 0xc000000000000001, RESULT_GT); + status |= test__comparedf2(0xc000000000000001, 0x4000000000000002, RESULT_LT); + status |= test__comparedf2(0xc000000000000001, 0xbff0000000000001, RESULT_LT); + status |= test__comparedf2(0xc000000000000001, 0xc000000000000002, RESULT_GT); + status |= test__comparedf2(0xc000000000000002, 0xbff0000000000001, RESULT_LT); + status |= test__comparedf2(0xc000000000000002, 0xbff0000000000003, RESULT_LT); + status |= test__comparedf2(0xc000000000000004, 0xc000000000000003, RESULT_LT); + status |= test__comparedf2(0xc008000000000000, 0x4008000000000000, RESULT_LT); + status |= test__comparedf2(0xc00fffffffffffff, 0xc00ffffffffffffe, RESULT_LT); + status |= test__comparedf2(0xc00fffffffffffff, 0xc010000000000002, RESULT_GT); + status |= test__comparedf2(0xc010000000000001, 0xc00fffffffffffff, RESULT_LT); + status |= test__comparedf2(0xffb0000000000001, 0xffafffffffffffff, RESULT_LT); + status |= test__comparedf2(0xffcfffffffffffff, 0xffcffffffffffffe, RESULT_LT); + status |= test__comparedf2(0xffcfffffffffffff, 0xffd0000000000002, RESULT_GT); + status |= test__comparedf2(0xffd0000000000000, 0xffcfffffffffffff, RESULT_LT); + status |= 
test__comparedf2(0xffd0000000000000, 0xffd0000000000001, RESULT_GT); + status |= test__comparedf2(0xffd0000000000001, 0x7fd0000000000000, RESULT_LT); + status |= test__comparedf2(0xffd0000000000001, 0xffd0000000000000, RESULT_LT); + status |= test__comparedf2(0xffd0000000000001, 0xffe0000000000001, RESULT_GT); + status |= test__comparedf2(0xffd0000000000002, 0xffc0000000000003, RESULT_LT); + status |= test__comparedf2(0xffd0000000000004, 0xffd0000000000003, RESULT_LT); + status |= test__comparedf2(0xffdffffffffffffe, 0x7fdffffffffffffe, RESULT_LT); + status |= test__comparedf2(0xffdffffffffffffe, 0x7fdfffffffffffff, RESULT_LT); + status |= test__comparedf2(0xffdffffffffffffe, 0xffdffffffffffffe, RESULT_EQ); + status |= test__comparedf2(0xffdffffffffffffe, 0xffdfffffffffffff, RESULT_GT); + status |= test__comparedf2(0xffdfffffffffffff, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffdfffffffffffff, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0xffdfffffffffffff, 0xbff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffdfffffffffffff, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0xffe0000000000000, 0x0000000000000000, RESULT_LT); + status |= test__comparedf2(0xffe0000000000000, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffe0000000000000, 0x7ff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffe0000000000000, 0x8000000000000000, RESULT_LT); + status |= test__comparedf2(0xffe0000000000000, 0xbff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffe0000000000000, 0xffe0000000000000, RESULT_EQ); + status |= test__comparedf2(0xffe0000000000000, 0xfff0000000000000, RESULT_GT); + status |= test__comparedf2(0xffe0000000000001, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0xffe0000000000001, 0xffe0000000000000, RESULT_LT); + status |= test__comparedf2(0xffe0000000000001, 0xffe0000000000002, RESULT_GT); + status |= test__comparedf2(0xffe0000000000002, 
0xffd0000000000001, RESULT_LT); + status |= test__comparedf2(0xffeffffffffffffe, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffeffffffffffffe, 0x7fefffffffffffff, RESULT_LT); + status |= test__comparedf2(0xffeffffffffffffe, 0xbff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffeffffffffffffe, 0xffefffffffffffff, RESULT_GT); + status |= test__comparedf2(0xffefffffffffffff, 0x0000000000000001, RESULT_LT); + status |= test__comparedf2(0xffefffffffffffff, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffefffffffffffff, 0x7ff000007d4a42a6, RESULT_UN); + status |= test__comparedf2(0xffefffffffffffff, 0x7ff7252c7d4a42a6, RESULT_UN); + status |= test__comparedf2(0xffefffffffffffff, 0x7ff980ec6115c6fb, RESULT_UN); + status |= test__comparedf2(0xffefffffffffffff, 0x8000000000000001, RESULT_LT); + status |= test__comparedf2(0xffefffffffffffff, 0xbff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffefffffffffffff, 0xffefffffffffffff, RESULT_EQ); + status |= test__comparedf2(0xffefffffffffffff, 0xfff000007d4a42a6, RESULT_UN); + status |= test__comparedf2(0xffefffffffffffff, 0xfff7252c7d4a42a6, RESULT_UN); + status |= test__comparedf2(0xffefffffffffffff, 0xfff980ec6115c6fb, RESULT_UN); + status |= test__comparedf2(0xfff0000000000000, 0x0000000000000000, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x0000000000000001, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x000fffffffffffff, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x7fefffffffffffff, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x7ff0000000000000, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x7ff00000578bbe24, RESULT_UN); + status |= test__comparedf2(0xfff0000000000000, 0x7ff63d54578bbe24, RESULT_UN); + status |= test__comparedf2(0xfff0000000000000, 0x7ffbc66614390083, RESULT_UN); + status |= 
test__comparedf2(0xfff0000000000000, 0x8000000000000000, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x8000000000000001, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x800fffffffffffff, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0xffe0000000000000, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0xffefffffffffffff, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0xfff0000000000000, RESULT_EQ); + status |= test__comparedf2(0xfff0000000000000, 0xfff00000578bbe24, RESULT_UN); + status |= test__comparedf2(0xfff0000000000000, 0xfff63d54578bbe24, RESULT_UN); + status |= test__comparedf2(0xfff0000000000000, 0xfffbc66614390083, RESULT_UN); + status |= test__comparedf2(0xfff0000047e8b9a0, 0x0000000000000000, RESULT_UN); + status |= test__comparedf2(0xfff4017647e8b9a0, 0x0000000000000000, RESULT_UN); + status |= test__comparedf2(0xfff00000abfe5d29, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0xfff2a1cdabfe5d29, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0xfff000005155db76, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff645cb5155db76, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff0000070c46aa0, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff2068470c46aa0, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff00000b5aee637, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff72b19b5aee637, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff00000c08c2788, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff1e0c1c08c2788, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff00000ec581a54, 0x7ff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff00000ec581a54, 0x7ff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff571eaec581a54, 0x7ff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff571eaec581a54, 
0x7ff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff000003a3a1f94, 0x7ff00000229f3502, RESULT_UN); + status |= test__comparedf2(0xfff000003a3a1f94, 0x7ffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0xfff6439e3a3a1f94, 0x7ff00000229f3502, RESULT_UN); + status |= test__comparedf2(0xfff6439e3a3a1f94, 0x7ffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0xfff00000ec581a54, 0xfff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff00000ec581a54, 0xfff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff571eaec581a54, 0xfff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff571eaec581a54, 0xfff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff000003a3a1f94, 0xfff00000229f3502, RESULT_UN); + status |= test__comparedf2(0xfff000003a3a1f94, 0xfffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0xfff6439e3a3a1f94, 0xfff00000229f3502, RESULT_UN); + status |= test__comparedf2(0xfff6439e3a3a1f94, 0xfffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0xfff00000c31d528e, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0xfff5fb72c31d528e, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0xfff00000ac81d215, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0xfff4481aac81d215, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0xfff00000d12062fd, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff707f6d12062fd, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff000001c6481ef, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff66ee91c6481ef, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff00000985729a7, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff19cff985729a7, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff0000053ec80fe, 0xfff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff7dbc153ec80fe, 0xfff0000000000000, RESULT_UN); + status |= 
test__comparedf2(0xfff00000816fb493, 0x0000000000000000, RESULT_UN); + status |= test__comparedf2(0xfff87f75816fb493, 0x0000000000000000, RESULT_UN); + status |= test__comparedf2(0xfff000000c2d7c33, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0xfff91ecb0c2d7c33, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0xfff00000a68bae40, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfffc0acda68bae40, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff000002fe14961, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfffcfa4e2fe14961, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff000005c206da1, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff800bb5c206da1, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff0000051887a34, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfffce11951887a34, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff000002b4c32a8, 0x7ff000001edb8786, RESULT_UN); + status |= test__comparedf2(0xfff000002b4c32a8, 0x7ff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0xfffbd6b52b4c32a8, 0x7ff000001edb8786, RESULT_UN); + status |= test__comparedf2(0xfffbd6b52b4c32a8, 0x7ff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0xfff00000bc88c2a9, 0x7ff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff00000bc88c2a9, 0x7ffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff8eaadbc88c2a9, 0x7ff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff8eaadbc88c2a9, 0x7ffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff000002b4c32a8, 0xfff000001edb8786, RESULT_UN); + status |= test__comparedf2(0xfff000002b4c32a8, 0xfff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0xfffbd6b52b4c32a8, 0xfff000001edb8786, RESULT_UN); + status |= test__comparedf2(0xfffbd6b52b4c32a8, 0xfff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0xfff00000bc88c2a9, 
0xfff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff00000bc88c2a9, 0xfffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff8eaadbc88c2a9, 0xfff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff8eaadbc88c2a9, 0xfffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff00000a47525ca, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0xfffcb028a47525ca, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0xfff0000097c1af12, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0xfffc541e97c1af12, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0xfff00000bb1c07a4, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff966b7bb1c07a4, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff000001d98f07c, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff9dbf61d98f07c, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff0000040e65504, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfffb2a7440e65504, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff00000d9dc7412, 0xfff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff8af62d9dc7412, 0xfff0000000000000, RESULT_UN); + + return status; +} diff --git a/compiler-rt/test/builtins/Unit/comparesf2new_test.c b/compiler-rt/test/builtins/Unit/comparesf2new_test.c new file mode 100644 index 0000000000000..b5dfe2352958f --- /dev/null +++ b/compiler-rt/test/builtins/Unit/comparesf2new_test.c @@ -0,0 +1,443 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// RUN: %clang_builtins %s %librt -o %t && %run %t +// REQUIRES: librt_has_comparesf2 + +#include "int_lib.h" +#include +#include + +#include "fp_test.h" + +COMPILER_RT_ABI int __eqsf2(float, float); +COMPILER_RT_ABI int __nesf2(float, float); +COMPILER_RT_ABI int __gesf2(float, float); +COMPILER_RT_ABI int __gtsf2(float, float); +COMPILER_RT_ABI int __lesf2(float, float); +COMPILER_RT_ABI int __ltsf2(float, float); +COMPILER_RT_ABI int __cmpsf2(float, float); +COMPILER_RT_ABI int __unordsf2(float, float); + +enum Result { RESULT_LT, RESULT_GT, RESULT_EQ, RESULT_UN }; + +int expect(uint32_t a_rep, uint32_t b_rep, const char *name, int result, int ok, + const char *expected, int line) { + if (!ok) + printf("error at line %d: %s(%08" PRIx32 ", %08" PRIx32 + ") = %d, expected %s\n", + line, name, a_rep, b_rep, result, expected); + return !ok; +} + +int test__comparesf2(uint32_t a_rep, uint32_t b_rep, enum Result result, + int line) { + float a = fromRep32(a_rep), b = fromRep32(b_rep); + + int eq = __eqsf2(a, b); + int ne = __nesf2(a, b); + int ge = __gesf2(a, b); + int gt = __gtsf2(a, b); + int le = __lesf2(a, b); + int lt = __ltsf2(a, b); +#ifdef __ELF__ + // The generic builtins/comparedf2.c does not define this function + // for object formats other than ELF + int cmp = __cmpsf2(a, b); +#endif + int unord = __unordsf2(a, b); + + int ret = 0; + + switch (result) { + case RESULT_LT: + ret |= expect(a_rep, b_rep, "__eqsf2", eq, eq != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__nesf2", ne, ne != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__gesf2", ge, ge < 0, "< 0", line); + ret |= expect(a_rep, b_rep, "__gtsf2", gt, gt <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__lesf2", le, le <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__ltsf2", lt, lt < 0, "< 0", line); +#ifdef __ELF__ + ret |= expect(a_rep, b_rep, "__cmpsf2", cmp, cmp == -1, "== -1", line); +#endif + ret |= expect(a_rep, 
b_rep, "__unordsf2", unord, unord == 0, "== 0", line); + break; + case RESULT_GT: + ret |= expect(a_rep, b_rep, "__eqsf2", eq, eq != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__nesf2", ne, ne != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__gesf2", ge, ge >= 0, ">= 0", line); + ret |= expect(a_rep, b_rep, "__gtsf2", gt, gt > 0, "> 0", line); + ret |= expect(a_rep, b_rep, "__lesf2", le, le > 0, "> 0", line); + ret |= expect(a_rep, b_rep, "__ltsf2", lt, lt >= 0, ">= 0", line); +#ifdef __ELF__ + ret |= expect(a_rep, b_rep, "__cmpsf2", cmp, cmp == 1, "== 1", line); +#endif + ret |= expect(a_rep, b_rep, "__unordsf2", unord, unord == 0, "== 0", line); + break; + case RESULT_EQ: + ret |= expect(a_rep, b_rep, "__eqsf2", eq, eq == 0, "== 0", line); + ret |= expect(a_rep, b_rep, "__nesf2", ne, ne == 0, "== 0", line); + ret |= expect(a_rep, b_rep, "__gesf2", ge, ge >= 0, ">= 0", line); + ret |= expect(a_rep, b_rep, "__gtsf2", gt, gt <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__lesf2", le, le <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__ltsf2", lt, lt >= 0, ">= 0", line); +#ifdef __ELF__ + ret |= expect(a_rep, b_rep, "__cmpsf2", cmp, cmp == 0, "== 0", line); +#endif + ret |= expect(a_rep, b_rep, "__unordsf2", unord, unord == 0, "== 0", line); + break; + case RESULT_UN: + ret |= expect(a_rep, b_rep, "__eqsf2", eq, eq != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__nesf2", ne, ne != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__gesf2", ge, ge < 0, "< 0", line); + ret |= expect(a_rep, b_rep, "__gtsf2", gt, gt <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__lesf2", le, le > 0, "> 0", line); + ret |= expect(a_rep, b_rep, "__ltsf2", lt, lt >= 0, ">= 0", line); +#ifdef __ELF__ + ret |= expect(a_rep, b_rep, "__cmpsf2", cmp, cmp == 1, "== 1", line); +#endif + ret |= expect(a_rep, b_rep, "__unordsf2", unord, unord == 1, "== 1", line); + break; + } + + return ret; +} + +#define test__comparesf2(a, b, x) test__comparesf2(a, b, x, __LINE__) + 
+int main(void) { + int status = 0; + + status |= test__comparesf2(0x00000000, 0x00000001, RESULT_LT); + status |= test__comparesf2(0x00000000, 0x007fffff, RESULT_LT); + status |= test__comparesf2(0x00000000, 0x3f800000, RESULT_LT); + status |= test__comparesf2(0x00000000, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0x00000000, 0x7f800000, RESULT_LT); + status |= test__comparesf2(0x00000000, 0x7f872da0, RESULT_UN); + status |= test__comparesf2(0x00000000, 0x7fe42e09, RESULT_UN); + status |= test__comparesf2(0x00000000, 0x80000000, RESULT_EQ); + status |= test__comparesf2(0x00000000, 0x80000001, RESULT_GT); + status |= test__comparesf2(0x00000000, 0x807fffff, RESULT_GT); + status |= test__comparesf2(0x00000000, 0x80800000, RESULT_GT); + status |= test__comparesf2(0x00000000, 0xff800000, RESULT_GT); + status |= test__comparesf2(0x00000001, 0x00000001, RESULT_EQ); + status |= test__comparesf2(0x00000001, 0x3f7fffff, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x3f800000, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x3ffffffe, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x3fffffff, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x7effffff, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x7f7ffffe, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x7f7fffff, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x7f94d5b9, RESULT_UN); + status |= test__comparesf2(0x00000001, 0x7fef53b1, RESULT_UN); + status |= test__comparesf2(0x00000001, 0x80000001, RESULT_GT); + status |= test__comparesf2(0x00000001, 0xbf7fffff, RESULT_GT); + status |= test__comparesf2(0x00000001, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x00000001, 0xbffffffe, RESULT_GT); + status |= test__comparesf2(0x00000001, 0xbfffffff, RESULT_GT); + status |= test__comparesf2(0x00000001, 0xfeffffff, RESULT_GT); + status |= test__comparesf2(0x00000001, 0xff000000, RESULT_GT); + status |= 
test__comparesf2(0x00000001, 0xff7ffffe, RESULT_GT); + status |= test__comparesf2(0x00000001, 0xff7fffff, RESULT_GT); + status |= test__comparesf2(0x00000002, 0x00000001, RESULT_GT); + status |= test__comparesf2(0x00000003, 0x00000002, RESULT_GT); + status |= test__comparesf2(0x00000003, 0x40400000, RESULT_LT); + status |= test__comparesf2(0x00000003, 0x40a00000, RESULT_LT); + status |= test__comparesf2(0x00000003, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0x00000003, 0xc0a00000, RESULT_GT); + status |= test__comparesf2(0x00000003, 0xff000000, RESULT_GT); + status |= test__comparesf2(0x00000004, 0x00000004, RESULT_EQ); + status |= test__comparesf2(0x007ffffc, 0x807ffffc, RESULT_GT); + status |= test__comparesf2(0x007ffffd, 0x007ffffe, RESULT_LT); + status |= test__comparesf2(0x007fffff, 0x00000000, RESULT_GT); + status |= test__comparesf2(0x007fffff, 0x007ffffe, RESULT_GT); + status |= test__comparesf2(0x007fffff, 0x007fffff, RESULT_EQ); + status |= test__comparesf2(0x007fffff, 0x00800000, RESULT_LT); + status |= test__comparesf2(0x007fffff, 0x7f800000, RESULT_LT); + status |= test__comparesf2(0x007fffff, 0x7fa111d3, RESULT_UN); + status |= test__comparesf2(0x007fffff, 0x7ff43134, RESULT_UN); + status |= test__comparesf2(0x007fffff, 0x80000000, RESULT_GT); + status |= test__comparesf2(0x007fffff, 0xff800000, RESULT_GT); + status |= test__comparesf2(0x00800000, 0x00000000, RESULT_GT); + status |= test__comparesf2(0x00800000, 0x00800000, RESULT_EQ); + status |= test__comparesf2(0x00800000, 0x80800000, RESULT_GT); + status |= test__comparesf2(0x00800001, 0x00800000, RESULT_GT); + status |= test__comparesf2(0x00800001, 0x00800002, RESULT_LT); + status |= test__comparesf2(0x00ffffff, 0x01000000, RESULT_LT); + status |= test__comparesf2(0x00ffffff, 0x01000002, RESULT_LT); + status |= test__comparesf2(0x00ffffff, 0x01000004, RESULT_LT); + status |= test__comparesf2(0x01000000, 0x00ffffff, RESULT_GT); + status |= test__comparesf2(0x01000001, 0x00800001, 
RESULT_GT); + status |= test__comparesf2(0x01000001, 0x00ffffff, RESULT_GT); + status |= test__comparesf2(0x01000002, 0x00800001, RESULT_GT); + status |= test__comparesf2(0x017fffff, 0x01800000, RESULT_LT); + status |= test__comparesf2(0x01800000, 0x017fffff, RESULT_GT); + status |= test__comparesf2(0x01800001, 0x017fffff, RESULT_GT); + status |= test__comparesf2(0x01800002, 0x01000003, RESULT_GT); + status |= test__comparesf2(0x3f000000, 0x3f000000, RESULT_EQ); + status |= test__comparesf2(0x3f7fffff, 0x00000001, RESULT_GT); + status |= test__comparesf2(0x3f7fffff, 0x80000001, RESULT_GT); + status |= test__comparesf2(0x3f800000, 0x3f800000, RESULT_EQ); + status |= test__comparesf2(0x3f800000, 0x3f800003, RESULT_LT); + status |= test__comparesf2(0x3f800000, 0x40000000, RESULT_LT); + status |= test__comparesf2(0x3f800000, 0x40e00000, RESULT_LT); + status |= test__comparesf2(0x3f800000, 0x7fb27f62, RESULT_UN); + status |= test__comparesf2(0x3f800000, 0x7fd9d4b4, RESULT_UN); + status |= test__comparesf2(0x3f800000, 0x80000000, RESULT_GT); + status |= test__comparesf2(0x3f800000, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x3f800000, 0xbf800003, RESULT_GT); + status |= test__comparesf2(0x3f800001, 0x3f800000, RESULT_GT); + status |= test__comparesf2(0x3f800001, 0x3f800002, RESULT_LT); + status |= test__comparesf2(0x3f800001, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x3ffffffc, 0x3ffffffd, RESULT_LT); + status |= test__comparesf2(0x3fffffff, 0x00000001, RESULT_GT); + status |= test__comparesf2(0x3fffffff, 0x40000000, RESULT_LT); + status |= test__comparesf2(0x40000000, 0x3f800000, RESULT_GT); + status |= test__comparesf2(0x40000000, 0x3fffffff, RESULT_GT); + status |= test__comparesf2(0x40000000, 0x40000000, RESULT_EQ); + status |= test__comparesf2(0x40000000, 0x40000001, RESULT_LT); + status |= test__comparesf2(0x40000000, 0xc0000000, RESULT_GT); + status |= test__comparesf2(0x40000000, 0xc0000001, RESULT_GT); + status |= 
test__comparesf2(0x40000000, 0xc0a00000, RESULT_GT); + status |= test__comparesf2(0x40000001, 0x3f800001, RESULT_GT); + status |= test__comparesf2(0x40000001, 0x40000002, RESULT_LT); + status |= test__comparesf2(0x40000001, 0xc0000002, RESULT_GT); + status |= test__comparesf2(0x40000002, 0x3f800001, RESULT_GT); + status |= test__comparesf2(0x40000002, 0x3f800003, RESULT_GT); + status |= test__comparesf2(0x40000004, 0x40000003, RESULT_GT); + status |= test__comparesf2(0x40400000, 0x40400000, RESULT_EQ); + status |= test__comparesf2(0x407fffff, 0x407ffffe, RESULT_GT); + status |= test__comparesf2(0x407fffff, 0x40800002, RESULT_LT); + status |= test__comparesf2(0x40800001, 0x407fffff, RESULT_GT); + status |= test__comparesf2(0x40a00000, 0x00000000, RESULT_GT); + status |= test__comparesf2(0x40a00000, 0x80000000, RESULT_GT); + status |= test__comparesf2(0x40a00000, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x40a00000, 0xc0a00000, RESULT_GT); + status |= test__comparesf2(0x7d800001, 0x7d7fffff, RESULT_GT); + status |= test__comparesf2(0x7e7fffff, 0x7e7ffffe, RESULT_GT); + status |= test__comparesf2(0x7e7fffff, 0x7e800002, RESULT_LT); + status |= test__comparesf2(0x7e800000, 0x7e7fffff, RESULT_GT); + status |= test__comparesf2(0x7e800000, 0x7e800000, RESULT_EQ); + status |= test__comparesf2(0x7e800000, 0x7e800001, RESULT_LT); + status |= test__comparesf2(0x7e800001, 0x7e800000, RESULT_GT); + status |= test__comparesf2(0x7e800001, 0x7f000001, RESULT_LT); + status |= test__comparesf2(0x7e800001, 0xfe800000, RESULT_GT); + status |= test__comparesf2(0x7e800002, 0x7e000003, RESULT_GT); + status |= test__comparesf2(0x7e800004, 0x7e800003, RESULT_GT); + status |= test__comparesf2(0x7efffffe, 0x7efffffe, RESULT_EQ); + status |= test__comparesf2(0x7efffffe, 0x7effffff, RESULT_LT); + status |= test__comparesf2(0x7efffffe, 0xfeffffff, RESULT_GT); + status |= test__comparesf2(0x7effffff, 0x3f800000, RESULT_GT); + status |= test__comparesf2(0x7effffff, 0x7f000000, 
RESULT_LT); + status |= test__comparesf2(0x7effffff, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x7effffff, 0xff000000, RESULT_GT); + status |= test__comparesf2(0x7f000000, 0x3f800000, RESULT_GT); + status |= test__comparesf2(0x7f000000, 0x7f000000, RESULT_EQ); + status |= test__comparesf2(0x7f000000, 0x7f800000, RESULT_LT); + status |= test__comparesf2(0x7f000000, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x7f000000, 0xff000000, RESULT_GT); + status |= test__comparesf2(0x7f000000, 0xff800000, RESULT_GT); + status |= test__comparesf2(0x7f000001, 0x7f000000, RESULT_GT); + status |= test__comparesf2(0x7f000001, 0x7f000002, RESULT_LT); + status |= test__comparesf2(0x7f000001, 0xff000000, RESULT_GT); + status |= test__comparesf2(0x7f000002, 0x7e800001, RESULT_GT); + status |= test__comparesf2(0x7f7ffffe, 0x3f800000, RESULT_GT); + status |= test__comparesf2(0x7f7ffffe, 0x7f7fffff, RESULT_LT); + status |= test__comparesf2(0x7f7ffffe, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x7f7ffffe, 0xff7fffff, RESULT_GT); + status |= test__comparesf2(0x7f7fffff, 0x00000001, RESULT_GT); + status |= test__comparesf2(0x7f7fffff, 0x3f800000, RESULT_GT); + status |= test__comparesf2(0x7f7fffff, 0x7f7fffff, RESULT_EQ); + status |= test__comparesf2(0x7f7fffff, 0x7fbed1eb, RESULT_UN); + status |= test__comparesf2(0x7f7fffff, 0x7fe15ee3, RESULT_UN); + status |= test__comparesf2(0x7f7fffff, 0x80000001, RESULT_GT); + status |= test__comparesf2(0x7f7fffff, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x00000000, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x00000001, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x007fffff, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x7f000000, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x7f7fffff, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x7f800000, RESULT_EQ); + status |= test__comparesf2(0x7f800000, 0x7f91a4da, RESULT_UN); + status |= 
test__comparesf2(0x7f800000, 0x7fd44a09, RESULT_UN); + status |= test__comparesf2(0x7f800000, 0x80000000, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x80000001, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x807fffff, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0xff000000, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0xff7fffff, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0xff800000, RESULT_GT); + status |= test__comparesf2(0x7f86d066, 0x00000000, RESULT_UN); + status |= test__comparesf2(0x7f85a878, 0x00000001, RESULT_UN); + status |= test__comparesf2(0x7f8c0dca, 0x007fffff, RESULT_UN); + status |= test__comparesf2(0x7f822725, 0x3f800000, RESULT_UN); + status |= test__comparesf2(0x7f853870, 0x7f7fffff, RESULT_UN); + status |= test__comparesf2(0x7fbefc9d, 0x7f800000, RESULT_UN); + status |= test__comparesf2(0x7f9f84a9, 0x7f81461b, RESULT_UN); + status |= test__comparesf2(0x7f9e2c1d, 0x7fe4a313, RESULT_UN); + status |= test__comparesf2(0x7fb0e6d0, 0x80000000, RESULT_UN); + status |= test__comparesf2(0x7fac9171, 0x80000001, RESULT_UN); + status |= test__comparesf2(0x7f824ae6, 0x807fffff, RESULT_UN); + status |= test__comparesf2(0x7fa8b9a0, 0xbf800000, RESULT_UN); + status |= test__comparesf2(0x7f92a1cd, 0xff7fffff, RESULT_UN); + status |= test__comparesf2(0x7fbe5d29, 0xff800000, RESULT_UN); + status |= test__comparesf2(0x7fcc9a57, 0x00000000, RESULT_UN); + status |= test__comparesf2(0x7fec9d71, 0x00000001, RESULT_UN); + status |= test__comparesf2(0x7fd5db76, 0x007fffff, RESULT_UN); + status |= test__comparesf2(0x7fd003d9, 0x3f800000, RESULT_UN); + status |= test__comparesf2(0x7fca0684, 0x7f7fffff, RESULT_UN); + status |= test__comparesf2(0x7fc46aa0, 0x7f800000, RESULT_UN); + status |= test__comparesf2(0x7ff72b19, 0x7faee637, RESULT_UN); + status |= test__comparesf2(0x7fe9e0c1, 0x7fcc2788, RESULT_UN); + status |= test__comparesf2(0x7fc571ea, 0x80000000, RESULT_UN); + status |= test__comparesf2(0x7fd81a54, 0x80000001, 
RESULT_UN); + status |= test__comparesf2(0x7febdfaf, 0x807fffff, RESULT_UN); + status |= test__comparesf2(0x7ffa1f94, 0xbf800000, RESULT_UN); + status |= test__comparesf2(0x7ff38fa0, 0xff7fffff, RESULT_UN); + status |= test__comparesf2(0x7fdf3502, 0xff800000, RESULT_UN); + status |= test__comparesf2(0x80000000, 0x00000000, RESULT_EQ); + status |= test__comparesf2(0x80000000, 0x00000001, RESULT_LT); + status |= test__comparesf2(0x80000000, 0x007fffff, RESULT_LT); + status |= test__comparesf2(0x80000000, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0x80000000, 0x7f800000, RESULT_LT); + status |= test__comparesf2(0x80000000, 0x7fbdfb72, RESULT_UN); + status |= test__comparesf2(0x80000000, 0x7fdd528e, RESULT_UN); + status |= test__comparesf2(0x80000000, 0x80000001, RESULT_GT); + status |= test__comparesf2(0x80000000, 0x807fffff, RESULT_GT); + status |= test__comparesf2(0x80000000, 0x80800000, RESULT_GT); + status |= test__comparesf2(0x80000000, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x80000000, 0xff800000, RESULT_GT); + status |= test__comparesf2(0x80000001, 0x00000001, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x3f7fffff, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x3f800000, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x3ffffffe, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x3fffffff, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x7effffff, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x7f7ffffe, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x7f7fffff, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x7fac481a, RESULT_UN); + status |= test__comparesf2(0x80000001, 0x7fcf111d, RESULT_UN); + status |= test__comparesf2(0x80000001, 0x80000001, RESULT_EQ); + status |= test__comparesf2(0x80000001, 0xbf7fffff, RESULT_GT); + status |= test__comparesf2(0x80000001, 0xbf800000, RESULT_GT); + status |= 
test__comparesf2(0x80000001, 0xbffffffe, RESULT_GT); + status |= test__comparesf2(0x80000001, 0xbfffffff, RESULT_GT); + status |= test__comparesf2(0x80000001, 0xfeffffff, RESULT_GT); + status |= test__comparesf2(0x80000001, 0xff000000, RESULT_GT); + status |= test__comparesf2(0x80000001, 0xff7ffffe, RESULT_GT); + status |= test__comparesf2(0x80000001, 0xff7fffff, RESULT_GT); + status |= test__comparesf2(0x80000002, 0x80000001, RESULT_LT); + status |= test__comparesf2(0x80000003, 0x40400000, RESULT_LT); + status |= test__comparesf2(0x80000003, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0x80000003, 0x80000002, RESULT_LT); + status |= test__comparesf2(0x80000003, 0xff000000, RESULT_GT); + status |= test__comparesf2(0x80000004, 0x80000004, RESULT_EQ); + status |= test__comparesf2(0x807ffffd, 0x807ffffe, RESULT_GT); + status |= test__comparesf2(0x807fffff, 0x00000000, RESULT_LT); + status |= test__comparesf2(0x807fffff, 0x007fffff, RESULT_LT); + status |= test__comparesf2(0x807fffff, 0x7f800000, RESULT_LT); + status |= test__comparesf2(0x807fffff, 0x7faf07f6, RESULT_UN); + status |= test__comparesf2(0x807fffff, 0x7fd18a54, RESULT_UN); + status |= test__comparesf2(0x807fffff, 0x80000000, RESULT_LT); + status |= test__comparesf2(0x807fffff, 0x807ffffe, RESULT_LT); + status |= test__comparesf2(0x807fffff, 0x807fffff, RESULT_EQ); + status |= test__comparesf2(0x807fffff, 0x80800000, RESULT_GT); + status |= test__comparesf2(0x807fffff, 0xff800000, RESULT_GT); + status |= test__comparesf2(0x80800000, 0x00000000, RESULT_LT); + status |= test__comparesf2(0x80800000, 0x00800000, RESULT_LT); + status |= test__comparesf2(0x80800001, 0x80800000, RESULT_LT); + status |= test__comparesf2(0x80800001, 0x80800002, RESULT_GT); + status |= test__comparesf2(0x80ffffff, 0x81000000, RESULT_GT); + status |= test__comparesf2(0x80ffffff, 0x81000002, RESULT_GT); + status |= test__comparesf2(0x80ffffff, 0x81000004, RESULT_GT); + status |= test__comparesf2(0x81000000, 0x80ffffff, 
RESULT_LT); + status |= test__comparesf2(0x81000001, 0x80800001, RESULT_LT); + status |= test__comparesf2(0x81000001, 0x80ffffff, RESULT_LT); + status |= test__comparesf2(0x81000002, 0x80800001, RESULT_LT); + status |= test__comparesf2(0x817fffff, 0x81800000, RESULT_GT); + status |= test__comparesf2(0x81800000, 0x817fffff, RESULT_LT); + status |= test__comparesf2(0x81800001, 0x817fffff, RESULT_LT); + status |= test__comparesf2(0x81800002, 0x81000003, RESULT_LT); + status |= test__comparesf2(0xbf800000, 0x3f800003, RESULT_LT); + status |= test__comparesf2(0xbf800000, 0x7fa66ee9, RESULT_UN); + status |= test__comparesf2(0xbf800000, 0x7fe481ef, RESULT_UN); + status |= test__comparesf2(0xbf800000, 0x80000000, RESULT_LT); + status |= test__comparesf2(0xbf800000, 0xbf800003, RESULT_GT); + status |= test__comparesf2(0xbf800001, 0x3f800000, RESULT_LT); + status |= test__comparesf2(0xbf800001, 0xbf800000, RESULT_LT); + status |= test__comparesf2(0xbf800001, 0xbf800002, RESULT_GT); + status |= test__comparesf2(0xbffffffc, 0xbffffffd, RESULT_GT); + status |= test__comparesf2(0xbfffffff, 0x00000001, RESULT_LT); + status |= test__comparesf2(0xbfffffff, 0xc0000000, RESULT_GT); + status |= test__comparesf2(0xc0000000, 0x40000001, RESULT_LT); + status |= test__comparesf2(0xc0000000, 0xbfffffff, RESULT_LT); + status |= test__comparesf2(0xc0000000, 0xc0000001, RESULT_GT); + status |= test__comparesf2(0xc0000001, 0x40000002, RESULT_LT); + status |= test__comparesf2(0xc0000001, 0xbf800001, RESULT_LT); + status |= test__comparesf2(0xc0000001, 0xc0000002, RESULT_GT); + status |= test__comparesf2(0xc0000002, 0xbf800001, RESULT_LT); + status |= test__comparesf2(0xc0000002, 0xbf800003, RESULT_LT); + status |= test__comparesf2(0xc0000004, 0xc0000003, RESULT_LT); + status |= test__comparesf2(0xc0400000, 0x40400000, RESULT_LT); + status |= test__comparesf2(0xc07fffff, 0xc07ffffe, RESULT_LT); + status |= test__comparesf2(0xc07fffff, 0xc0800002, RESULT_GT); + status |= 
test__comparesf2(0xc0800001, 0xc07fffff, RESULT_LT); + status |= test__comparesf2(0xfd800001, 0xfd7fffff, RESULT_LT); + status |= test__comparesf2(0xfe7fffff, 0xfe7ffffe, RESULT_LT); + status |= test__comparesf2(0xfe7fffff, 0xfe800002, RESULT_GT); + status |= test__comparesf2(0xfe800000, 0xfe7fffff, RESULT_LT); + status |= test__comparesf2(0xfe800000, 0xfe800001, RESULT_GT); + status |= test__comparesf2(0xfe800001, 0x7e800000, RESULT_LT); + status |= test__comparesf2(0xfe800001, 0xfe800000, RESULT_LT); + status |= test__comparesf2(0xfe800001, 0xff000001, RESULT_GT); + status |= test__comparesf2(0xfe800002, 0xfe000003, RESULT_LT); + status |= test__comparesf2(0xfe800004, 0xfe800003, RESULT_LT); + status |= test__comparesf2(0xfefffffe, 0x7efffffe, RESULT_LT); + status |= test__comparesf2(0xfefffffe, 0x7effffff, RESULT_LT); + status |= test__comparesf2(0xfefffffe, 0xfefffffe, RESULT_EQ); + status |= test__comparesf2(0xfefffffe, 0xfeffffff, RESULT_GT); + status |= test__comparesf2(0xfeffffff, 0x3f800000, RESULT_LT); + status |= test__comparesf2(0xfeffffff, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0xfeffffff, 0xbf800000, RESULT_LT); + status |= test__comparesf2(0xfeffffff, 0xff000000, RESULT_GT); + status |= test__comparesf2(0xff000000, 0x00000000, RESULT_LT); + status |= test__comparesf2(0xff000000, 0x3f800000, RESULT_LT); + status |= test__comparesf2(0xff000000, 0x7f800000, RESULT_LT); + status |= test__comparesf2(0xff000000, 0x80000000, RESULT_LT); + status |= test__comparesf2(0xff000000, 0xbf800000, RESULT_LT); + status |= test__comparesf2(0xff000000, 0xff000000, RESULT_EQ); + status |= test__comparesf2(0xff000000, 0xff800000, RESULT_GT); + status |= test__comparesf2(0xff000001, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0xff000001, 0xff000000, RESULT_LT); + status |= test__comparesf2(0xff000001, 0xff000002, RESULT_GT); + status |= test__comparesf2(0xff000002, 0xfe800001, RESULT_LT); + status |= test__comparesf2(0xff7ffffe, 0x3f800000, 
RESULT_LT); + status |= test__comparesf2(0xff7ffffe, 0x7f7fffff, RESULT_LT); + status |= test__comparesf2(0xff7ffffe, 0xbf800000, RESULT_LT); + status |= test__comparesf2(0xff7ffffe, 0xff7fffff, RESULT_GT); + status |= test__comparesf2(0xff7fffff, 0x00000001, RESULT_LT); + status |= test__comparesf2(0xff7fffff, 0x3f800000, RESULT_LT); + status |= test__comparesf2(0xff7fffff, 0x7f919cff, RESULT_UN); + status |= test__comparesf2(0xff7fffff, 0x7fd729a7, RESULT_UN); + status |= test__comparesf2(0xff7fffff, 0x80000001, RESULT_LT); + status |= test__comparesf2(0xff7fffff, 0xbf800000, RESULT_LT); + status |= test__comparesf2(0xff7fffff, 0xff7fffff, RESULT_EQ); + status |= test__comparesf2(0xff800000, 0x00000000, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x00000001, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x007fffff, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x7f7fffff, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x7f800000, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x7fafdbc1, RESULT_UN); + status |= test__comparesf2(0xff800000, 0x7fec80fe, RESULT_UN); + status |= test__comparesf2(0xff800000, 0x80000000, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x80000001, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x807fffff, RESULT_LT); + status |= test__comparesf2(0xff800000, 0xff000000, RESULT_LT); + status |= test__comparesf2(0xff800000, 0xff7fffff, RESULT_LT); + status |= test__comparesf2(0xff800000, 0xff800000, RESULT_EQ); + + return status; +} diff --git a/compiler-rt/test/builtins/Unit/divdf3new_test.c b/compiler-rt/test/builtins/Unit/divdf3new_test.c new file mode 100644 index 0000000000000..866c7cb08e519 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/divdf3new_test.c @@ -0,0 +1,862 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// RUN: %clang_builtins %s %librt -o %t && %run %t +// REQUIRES: librt_has_divdf3 + +#include "int_lib.h" +#include <inttypes.h> +#include <stdio.h> + +#include "fp_test.h" + +// By default this test uses compareResultD to check the returned floats, which +// accepts any returned NaN if the expected result is the canonical NaN value +// 0x7ff8000000000000. For the Arm optimized FP implementation, which commits +// to a more detailed handling of NaNs, we tighten up the check and include +// some extra test cases specific to that NaN policy. +#if COMPILER_RT_ARM_OPTIMIZED_FP +# define EXPECT_EXACT_RESULTS +# define ARM_NAN_HANDLING +#endif + +// Returns: a / b +COMPILER_RT_ABI double __divdf3(double a, double b); + +int test__divdf3(uint64_t a_rep, uint64_t b_rep, uint64_t expected_rep, + int line) { + double a = fromRep64(a_rep), b = fromRep64(b_rep); + double x = __divdf3(a, b); +#ifdef EXPECT_EXACT_RESULTS + int ret = toRep64(x) != expected_rep; /* require the exact bit pattern */ +#else + int ret = compareResultD(x, expected_rep); /* tolerant of NaN encodings */ +#endif + + if (ret) { + printf("error at line %d: __divdf3(%016" PRIx64 ", %016" PRIx64 + ") = %016" PRIx64 ", expected %016" PRIx64 "\n", + line, a_rep, b_rep, toRep64(x), expected_rep); + } + return ret; +} + +/* Each call site implicitly appends its own __LINE__ for diagnostics. */ +#define test__divdf3(a, b, x) test__divdf3(a, b, x, __LINE__) + +int main(void) { + int status = 0; + + status |= + test__divdf3(0x0000000000000000, 0x0000000000000001, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x000fffffffffffff, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x0010000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x001fffffffffffff, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x3ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x4014000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x7fdfffffffffffff, 0x0000000000000000); + status |= + 
test__divdf3(0x0000000000000000, 0x7fe0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x8000000000000002, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x800fffffffffffff, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x8010000000000001, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x8020000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0xc008000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0xc01c000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0xffcfffffffffffff, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0xffe0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000001, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x0000000000000001, 0x3fc0000000000000, 0x0000000000000008); + status |= + test__divdf3(0x0000000000000001, 0x3fe0000000000000, 0x0000000000000002); + status |= + test__divdf3(0x0000000000000001, 0x4000000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000001, 0x7fefffffffffffff, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000001, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000001, 0xc000000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000001, 0xffefffffffffffff, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000002, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x0000000000000002, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000009, 0x4022000000000000, 0x0000000000000001); + status |= + test__divdf3(0x0000000000000009, 0xc022000000000000, 
0x8000000000000001); + status |= + test__divdf3(0x000ffffffffffff7, 0x3feffffffffffffe, 0x000ffffffffffff8); + status |= + test__divdf3(0x000ffffffffffffe, 0x3feffffffffffffe, 0x000fffffffffffff); + status |= + test__divdf3(0x000fffffffffffff, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x000fffffffffffff, 0x3f60000000000000, 0x009ffffffffffffe); + status |= + test__divdf3(0x000fffffffffffff, 0x3fe0000000000000, 0x001ffffffffffffe); + status |= + test__divdf3(0x000fffffffffffff, 0x3ff0000000000000, 0x000fffffffffffff); + status |= + test__divdf3(0x000fffffffffffff, 0x3ff0000000000002, 0x000ffffffffffffd); + status |= + test__divdf3(0x000fffffffffffff, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x000fffffffffffff, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x000fffffffffffff, 0xbff0000000000000, 0x800fffffffffffff); + status |= + test__divdf3(0x000fffffffffffff, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0010000000000000, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x0010000000000000, 0x3ff0000000000001, 0x000fffffffffffff); + status |= + test__divdf3(0x0010000000000000, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0010000000000001, 0x3ff0000000000002, 0x000fffffffffffff); + status |= + test__divdf3(0x0010000000000001, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x0010000000000001, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0010000000000002, 0x3ff0000000000006, 0x000ffffffffffffc); + status |= + test__divdf3(0x001ffffffffffffe, 0x4000000000000000, 0x000fffffffffffff); + status |= + test__divdf3(0x001fffffffffffff, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x001fffffffffffff, 0x4000000000000000, 0x0010000000000000); + status |= + test__divdf3(0x001fffffffffffff, 0x7ff0000000000000, 0x0000000000000000); + status |= + 
test__divdf3(0x0020000000000000, 0x0010000000000000, 0x4000000000000000); + status |= + test__divdf3(0x0020000000000000, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x0020000000000000, 0xc000000000000000, 0x8010000000000000); + status |= + test__divdf3(0x0020000000000000, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0020000000000001, 0x0010000000000001, 0x4000000000000000); + status |= + test__divdf3(0x0020000000000001, 0xc000000000000000, 0x8010000000000001); + status |= + test__divdf3(0x0020000000000003, 0x8010000000000003, 0xc000000000000000); + status |= + test__divdf3(0x0020000000000003, 0xc000000000000000, 0x8010000000000003); + status |= + test__divdf3(0x3feffffffffffff7, 0x3feffffffffffffb, 0x3feffffffffffffc); + status |= + test__divdf3(0x3feffffffffffff7, 0x3feffffffffffffe, 0x3feffffffffffff9); + status |= + test__divdf3(0x3feffffffffffff8, 0x3feffffffffffffc, 0x3feffffffffffffc); + status |= + test__divdf3(0x3feffffffffffff8, 0x3feffffffffffffd, 0x3feffffffffffffb); + status |= + test__divdf3(0x3feffffffffffffa, 0x3feffffffffffff9, 0x3ff0000000000001); + status |= + test__divdf3(0x3feffffffffffffb, 0x3feffffffffffff9, 0x3ff0000000000001); + status |= + test__divdf3(0x3feffffffffffffc, 0x3feffffffffffff9, 0x3ff0000000000002); + status |= + test__divdf3(0x3feffffffffffffc, 0x3feffffffffffffd, 0x3fefffffffffffff); + status |= + test__divdf3(0x3feffffffffffffc, 0x3feffffffffffffe, 0x3feffffffffffffe); + status |= + test__divdf3(0x3feffffffffffffc, 0x3fefffffffffffff, 0x3feffffffffffffd); + status |= + test__divdf3(0x3feffffffffffffc, 0x3ff0000000000001, 0x3feffffffffffffa); + status |= + test__divdf3(0x3feffffffffffffd, 0x3feffffffffffff9, 0x3ff0000000000002); + status |= + test__divdf3(0x3feffffffffffffd, 0x3feffffffffffffc, 0x3ff0000000000001); + status |= + test__divdf3(0x3feffffffffffffd, 0x3feffffffffffffe, 0x3fefffffffffffff); + status |= + test__divdf3(0x3feffffffffffffd, 0x3fefffffffffffff, 
0x3feffffffffffffe); + status |= + test__divdf3(0x3feffffffffffffd, 0x3ff0000000000001, 0x3feffffffffffffb); + status |= + test__divdf3(0x3feffffffffffffd, 0x3ff0000000000002, 0x3feffffffffffff9); + status |= + test__divdf3(0x3feffffffffffffe, 0x3feffffffffffff9, 0x3ff0000000000003); + status |= + test__divdf3(0x3feffffffffffffe, 0x3feffffffffffffc, 0x3ff0000000000001); + status |= + test__divdf3(0x3feffffffffffffe, 0x3feffffffffffffd, 0x3ff0000000000001); + status |= + test__divdf3(0x3feffffffffffffe, 0x3fefffffffffffff, 0x3fefffffffffffff); + status |= + test__divdf3(0x3feffffffffffffe, 0x3ff0000000000001, 0x3feffffffffffffc); + status |= + test__divdf3(0x3feffffffffffffe, 0x3ff0000000000002, 0x3feffffffffffffa); + status |= + test__divdf3(0x3feffffffffffffe, 0x3ff0000000000003, 0x3feffffffffffff8); + status |= + test__divdf3(0x3fefffffffffffff, 0x3feffffffffffff9, 0x3ff0000000000003); + status |= + test__divdf3(0x3fefffffffffffff, 0x3feffffffffffffc, 0x3ff0000000000002); + status |= + test__divdf3(0x3fefffffffffffff, 0x3feffffffffffffd, 0x3ff0000000000001); + status |= + test__divdf3(0x3fefffffffffffff, 0x3feffffffffffffe, 0x3ff0000000000001); + status |= + test__divdf3(0x3fefffffffffffff, 0x3ff0000000000001, 0x3feffffffffffffd); + status |= + test__divdf3(0x3fefffffffffffff, 0x3ff0000000000002, 0x3feffffffffffffb); + status |= + test__divdf3(0x3fefffffffffffff, 0x3ff0000000000003, 0x3feffffffffffff9); + status |= + test__divdf3(0x3fefffffffffffff, 0x3ff0000000000004, 0x3feffffffffffff7); + status |= + test__divdf3(0x3ff0000000000000, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x3ff0000000000000, 0x3feffffffffffff7, 0x3ff0000000000005); + status |= + test__divdf3(0x3ff0000000000000, 0x3feffffffffffff8, 0x3ff0000000000004); + status |= + test__divdf3(0x3ff0000000000000, 0x3feffffffffffffb, 0x3ff0000000000003); + status |= + test__divdf3(0x3ff0000000000000, 0x3feffffffffffffc, 0x3ff0000000000002); + status |= + 
test__divdf3(0x3ff0000000000000, 0x3feffffffffffffd, 0x3ff0000000000002); + status |= + test__divdf3(0x3ff0000000000000, 0x3feffffffffffffe, 0x3ff0000000000001); + status |= + test__divdf3(0x3ff0000000000000, 0x3fefffffffffffff, 0x3ff0000000000001); + status |= + test__divdf3(0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000); + status |= + test__divdf3(0x3ff0000000000000, 0x3ff0000000000001, 0x3feffffffffffffe); + status |= + test__divdf3(0x3ff0000000000000, 0x3ff0000000000002, 0x3feffffffffffffc); + status |= + test__divdf3(0x3ff0000000000000, 0x3ff0000000000003, 0x3feffffffffffffa); + status |= + test__divdf3(0x3ff0000000000000, 0x3ff0000000000004, 0x3feffffffffffff8); + status |= + test__divdf3(0x3ff0000000000000, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x3ff0000000000001, 0x3feffffffffffffb, 0x3ff0000000000004); + status |= + test__divdf3(0x3ff0000000000001, 0x3feffffffffffffd, 0x3ff0000000000003); + status |= + test__divdf3(0x3ff0000000000001, 0x3feffffffffffffe, 0x3ff0000000000002); + status |= + test__divdf3(0x3ff0000000000001, 0x3fefffffffffffff, 0x3ff0000000000002); + status |= + test__divdf3(0x3ff0000000000001, 0x3ff0000000000002, 0x3feffffffffffffe); + status |= + test__divdf3(0x3ff0000000000001, 0x3ff0000000000003, 0x3feffffffffffffc); + status |= + test__divdf3(0x3ff0000000000002, 0x3feffffffffffffc, 0x3ff0000000000004); + status |= + test__divdf3(0x3ff0000000000002, 0x3feffffffffffffd, 0x3ff0000000000004); + status |= + test__divdf3(0x3ff0000000000002, 0x3feffffffffffffe, 0x3ff0000000000003); + status |= + test__divdf3(0x3ff0000000000002, 0x3fefffffffffffff, 0x3ff0000000000003); + status |= + test__divdf3(0x3ff0000000000002, 0x3ff0000000000001, 0x3ff0000000000001); + status |= + test__divdf3(0x3ff0000000000002, 0x3ff0000000000003, 0x3feffffffffffffe); + status |= + test__divdf3(0x3ff0000000000003, 0x3feffffffffffffd, 0x3ff0000000000005); + status |= + test__divdf3(0x3ff0000000000003, 0x3feffffffffffffe, 
0x3ff0000000000004); + status |= + test__divdf3(0x3ff0000000000003, 0x3fefffffffffffff, 0x3ff0000000000004); + status |= + test__divdf3(0x3ff0000000000003, 0x3ff0000000000001, 0x3ff0000000000002); + status |= + test__divdf3(0x3ff0000000000004, 0x3feffffffffffffe, 0x3ff0000000000005); + status |= + test__divdf3(0x3ff0000000000004, 0x3ff0000000000001, 0x3ff0000000000003); + status |= + test__divdf3(0x3ff0000000000004, 0x3ff0000000000007, 0x3feffffffffffffa); + status |= + test__divdf3(0x3ff0000000000005, 0x3fefffffffffffff, 0x3ff0000000000006); + status |= + test__divdf3(0x3ff0000000000006, 0x3ff0000000000008, 0x3feffffffffffffc); + status |= + test__divdf3(0x3ff0000000000007, 0x3ff0000000000002, 0x3ff0000000000005); + status |= + test__divdf3(0x3ff0000000000009, 0x3ff0000000000008, 0x3ff0000000000001); + status |= + test__divdf3(0x3ff199999999999a, 0x3ff3333333333333, 0x3fed555555555556); + status |= + test__divdf3(0x4000000000000000, 0x3ff0000000000000, 0x4000000000000000); + status |= + test__divdf3(0x4000000000000000, 0xbff0000000000000, 0xc000000000000000); + status |= + test__divdf3(0x4008000000000000, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x4008000000000000, 0xc008000000000000, 0xbff0000000000000); + status |= + test__divdf3(0x4008000000000000, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x4014000000000000, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x4014000000000000, 0x4014000000000000, 0x3ff0000000000000); + status |= + test__divdf3(0x4014000000000000, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x401c000000000000, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x401c000000000000, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x4020000000000000, 0x4000000000000000, 0x4010000000000000); + status |= + test__divdf3(0x4022000000000000, 0x4008000000000000, 0x4008000000000000); + status |= + 
test__divdf3(0x7f60000000000000, 0x00a0000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7fcfffffffffffff, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7fdffffffffffffd, 0xc000000000000000, 0xffcffffffffffffd); + status |= + test__divdf3(0x7fdfffffffffffff, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7fdfffffffffffff, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0x000fffffffffffff, 0x7ff0000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0x3fe0000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0x4000000000000000, 0x7fd0000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0xbfe0000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0xc000000000000000, 0xffd0000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x7fe0000000000003, 0xffd0000000000003, 0xc000000000000000); + status |= + test__divdf3(0x7feffffffffffffd, 0x4010000000000000, 0x7fcffffffffffffd); + status |= + test__divdf3(0x7feffffffffffffd, 0xc010000000000000, 0xffcffffffffffffd); + status |= + test__divdf3(0x7fefffffffffffff, 0x0000000000000001, 0x7ff0000000000000); + status |= + test__divdf3(0x7fefffffffffffff, 0x3fefffffffffffff, 0x7ff0000000000000); + status |= + test__divdf3(0x7fefffffffffffff, 0x7fcfffffffffffff, 0x4010000000000000); + status |= + test__divdf3(0x7fefffffffffffff, 0x7fdfffffffffffff, 0x4000000000000000); + status |= + test__divdf3(0x7fefffffffffffff, 0xc000000000000000, 0xffdfffffffffffff); + status |= + test__divdf3(0x7fefffffffffffff, 0xffcfffffffffffff, 
0xc010000000000000); + status |= + test__divdf3(0x7fefffffffffffff, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x0000000000000001, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x000fffffffffffff, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x0010000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x001fffffffffffff, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x3ff0000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x4014000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x7fdfffffffffffff, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x7fe0000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x8000000000000002, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x800fffffffffffff, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x8010000000000001, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x8020000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0xc008000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0xc01c000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0xffcfffffffffffff, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0xffe0000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0xffefffffffffffff, 0xfff0000000000000); + status |= + test__divdf3(0x8000000000000000, 0x0000000000000003, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x000fffffffffffff, 0x8000000000000000); + status |= + 
test__divdf3(0x8000000000000000, 0x0010000000000001, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x0020000000000000, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x4000000000000000, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x4018000000000000, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x7fcfffffffffffff, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x7fd0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x8000000000000004, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x800fffffffffffff, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x8010000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x801fffffffffffff, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0xc010000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0xc020000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0xffd0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0xffdfffffffffffff, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000001, 0x3fe0000000000000, 0x8000000000000002); + status |= + test__divdf3(0x8000000000000001, 0x4000000000000000, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000001, 0x7fefffffffffffff, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000001, 0xc000000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000001, 0xffefffffffffffff, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000003, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x8000000000000003, 0x7ff0000000000000, 
0x8000000000000000); + status |= + test__divdf3(0x8000000000000004, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x8000000000000004, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x800ffffffffffff8, 0x3feffffffffffffe, 0x800ffffffffffff9); + status |= + test__divdf3(0x800fffffffffffff, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x800fffffffffffff, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x800fffffffffffff, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x800fffffffffffff, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8010000000000000, 0x3ff0000000000001, 0x800fffffffffffff); + status |= + test__divdf3(0x8010000000000000, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x8010000000000000, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8010000000000001, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x8010000000000001, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x801fffffffffffff, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x801fffffffffffff, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8020000000000000, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x8020000000000000, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x8020000000000001, 0x0010000000000001, 0xc000000000000000); + status |= + test__divdf3(0x8020000000000005, 0x0010000000000005, 0xc000000000000000); + status |= + test__divdf3(0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000); + status |= + test__divdf3(0xbff0000000000000, 0xbff0000000000000, 0x3ff0000000000000); + status |= + test__divdf3(0xc000000000000000, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xc000000000000000, 0x3ff0000000000000, 0xc000000000000000); + status |= + 
test__divdf3(0xc000000000000000, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0xc000000000000000, 0xbff0000000000000, 0x4000000000000000); + status |= + test__divdf3(0xc010000000000000, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xc010000000000000, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0xc018000000000000, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xc018000000000000, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0xc018000000000000, 0xc008000000000000, 0x4000000000000000); + status |= + test__divdf3(0xc01c000000000000, 0x401c000000000000, 0xbff0000000000000); + status |= + test__divdf3(0xc020000000000000, 0x4000000000000000, 0xc010000000000000); + status |= + test__divdf3(0xc020000000000000, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xc020000000000000, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0xc022000000000000, 0xc008000000000000, 0x4008000000000000); + status |= + test__divdf3(0xffcfffffffffffff, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xffcfffffffffffff, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0xffd0000000000000, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xffd0000000000000, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0xffd0000000000000, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xffd0000000000000, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0xffdfffffffffffff, 0x4000000000000000, 0xffcfffffffffffff); + status |= + test__divdf3(0xffdfffffffffffff, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xffe0000000000000, 0x3fe0000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xffe0000000000000, 0xbfe0000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xffe0000000000001, 0x7fd0000000000001, 
0xc000000000000000); + status |= + test__divdf3(0xffeffffffffffffd, 0x4010000000000000, 0xffcffffffffffffd); + status |= + test__divdf3(0xffeffffffffffffd, 0xc010000000000000, 0x7fcffffffffffffd); + status |= + test__divdf3(0xffefffffffffffff, 0x7fcfffffffffffff, 0xc010000000000000); + status |= + test__divdf3(0xffefffffffffffff, 0xffcfffffffffffff, 0x4010000000000000); + status |= + test__divdf3(0xffefffffffffffff, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x0000000000000003, 0xfff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x000fffffffffffff, 0xfff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x0010000000000001, 0xfff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x0020000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x4000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x4018000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x7fd0000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x8000000000000004, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x800fffffffffffff, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x8010000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x801fffffffffffff, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0xc010000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0xc020000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0xffd0000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0xffefffffffffffff, 0x7ff0000000000000); + status |= + 
test__divdf3(0x800ffffffdffffff, 0xc00fff8000000000, 0x0004000fffbfff00); + status |= + test__divdf3(0xb7fbffffffffffff, 0xffe0000000000007, 0x0000000000000000); + status |= + test__divdf3(0x3ff660beb3029ffd, 0x3ff52e22fb7ace43, 0x3ff0e79e59ccb735); + status |= + test__divdf3(0x3ff73ddbc621eb00, 0x3ffb8224c030d747, 0x3feb095d4073d13b); + status |= + test__divdf3(0x3ff9a3b1ff2bf973, 0x3ff42fdf35d2d3bd, 0x3ff452508f203fca); + status |= + test__divdf3(0x3ffa2f42f2a01655, 0x3ff01310ba9f33d1, 0x3ffa103474220298); + status |= + test__divdf3(0x3ffa6b3e65d68478, 0x3ff773ca580800a9, 0x3ff206204bf651cc); + status |= + test__divdf3(0x3ffae840ed05aaad, 0x3ff374c8afa6bd73, 0x3ff620a0b38357dd); + status |= + test__divdf3(0x3ffc9bff90e124f7, 0x3ff19678d03f31b9, 0x3ffa06ce5731c244); + status |= + test__divdf3(0x3ff716518068f63e, 0x3ffea080001fffff, 0x3fe81f4927e2f813); + status |= + test__divdf3(0x3ff30b70c9e177b3, 0x3ffdc1dbcddeaaf7, 0x3fe47ae453d79b63); + status |= + test__divdf3(0x3ff690a0c1cf289e, 0x3ffdd0e4ec596ead, 0x3fe837c35c721292); + status |= + test__divdf3(0x3ff9a9f18698d1c5, 0x3ffdcf214b672807, 0x3feb8cd196d1e2db); + status |= + test__divdf3(0x3ffc412def95e9f2, 0x3ffe09fd73e44afb, 0x3fee195e4c411819); + status |= + test__divdf3(0x3ffab674f26df917, 0x3ffe55a80dfd623d, 0x3fec2de561fb628a); + status |= + test__divdf3(0x3ff15bb10851a33b, 0x3ffe770229894d4f, 0x3fe23b9bdf3ad4d7); + status |= + test__divdf3(0x3ff6ce035de00c24, 0x3fff04076d288c95, 0x3fe7874738e5ef5e); + status |= + test__divdf3(0x3ffb0e73f83fd2b4, 0x3fff01150ca4f6e3, 0x3febece97e64ff65); + status |= + test__divdf3(0x3ff53fff6c6d7043, 0x3fffb55c0bf15be1, 0x3fe57204f8441410); + status |= + test__divdf3(0x3ffa8aa3bbff7c4b, 0x3fffd530fa74cc5f, 0x3feaae55281a47cf); + status |= + test__divdf3(0x3ff3004b0d901379, 0x3ffe470662686931, 0x3fe41508eef9d818); + status |= + test__divdf3(0x3ffac10f29e80b25, 0x3ffe2fba9d423c9d, 0x3fec5c8a8148eb26); + status |= + test__divdf3(0x3ff8a3e14fe0651f, 0x3ffdeeae50e07679, 
0x3fea579ce7a3f61c); + status |= + test__divdf3(0x3ff168321760dd0d, 0x3ffd382a2b3c2c27, 0x3fe31042c5fcbe35); + status |= + test__divdf3(0x3ff208350f930e99, 0x3ffc80beeab6d9ed, 0x3fe43e9486314a0e); + status |= + test__divdf3(0x3ff46a9470b46af6, 0x3ffc2e13c9335b3f, 0x3fe72f150e86f5a1); + status |= + test__divdf3(0x3ffaf26f45d21562, 0x3ffbe6d631b290e7, 0x3feee7b30b353e95); + status |= + test__divdf3(0x3ff5cda6f52381df, 0x3ffbe2a5bce4483f, 0x3fe90542a0e62c21); + status |= + test__divdf3(0x3ff92aeb8209bb69, 0x3ffb57a0bdf7af6f, 0x3fed74754022b839); + status |= + test__divdf3(0x3ff627c9c1a1903d, 0x3ffb3c161457a7e1, 0x3fea082feee891f0); + status |= + test__divdf3(0x3ffa5fef91208fd5, 0x3ff68928392cf5e7, 0x3ff2b9c16cd0a6eb); + status |= + test__divdf3(0x3ffdc6825d6a2ad2, 0x3ff69bb9ca89cd3f, 0x3ff5127c1399515f); + status |= + test__divdf3(0x3ffd62dbb1150699, 0x3ff6e12d3daf7823, 0x3ff48cd52e787bc5); + status |= + test__divdf3(0x3ffb9f0e3f946dd2, 0x3ff75a51f01f688b, 0x3ff2ecadebdfdf91); + status |= + test__divdf3(0x3ffdf21fc13ef609, 0x3ff77a80c8098ae1, 0x3ff46843217c9c90); + status |= + test__divdf3(0x3ff83f6d28924d31, 0x3ff7cb607bcc758f, 0x3ff04e08e26c84b7); + status |= + test__divdf3(0x3ffef8774307cea5, 0x3ff849124d13461d, 0x3ff467851369d61a); + status |= + test__divdf3(0x3ffd7c2259068fa2, 0x3ffa9e9faf8d6845, 0x3ff1b8e24ddeb546); + status |= + test__divdf3(0x3fffb10b35d3977b, 0x3ffb57a0bdf7af6f, 0x3ff28b8abfdd47c7); + status |= + test__divdf3(0x3ffdcfa4097387f1, 0x3ffbe6d631b290e7, 0x3ff1184cf4cac16b); + status |= + test__divdf3(0x3ffcb6231a615d02, 0x3ffb98faef6f9417, 0x3ff0a552a67a8e2d); + status |= + test__divdf3(0x3ffba5443a5d0a42, 0x3ffb3a5c10922a9d, 0x3ff03ed2622d2a26); + status |= + test__divdf3(0x3fff3144ae86b33e, 0x3ffa58948417f235, 0x3ff2f17912d557f2); + status |= + test__divdf3(0x3ffd68635bf6605a, 0x3ff945fce3a79f3f, 0x3ff29e0c7d6617a1); + status |= + test__divdf3(0x3ff97e6030354676, 0x3ff906f78f460697, 0x3ff04c56a5f3136d); + status |= + 
test__divdf3(0x3ffe86f743594e95, 0x3ff8444d7946422d, 0x3ff420b1e63f512e); + status |= + test__divdf3(0x3fff12a6c5539a9a, 0x3ff7cad48079af09, 0x3ff4e564f736b864); + status |= + test__divdf3(0x3ffa5371fe989251, 0x3ff6fc5272dc36d1, 0x3ff2533d7a4d0ee8); + status |= + test__divdf3(0x3ffe18c0547f65d2, 0x3ff6fc9e8dd915ed, 0x3ff4f2e7f917b80e); + status |= + test__divdf3(0x3ffd7aea8a297055, 0x3ff64eb95d608cd9, 0x3ff52500dc28664c); + + // Test that the result of an operation is a NaN at all when it should be. + // + // In most configurations these tests' results are checked compared using + // compareResultD, so we set all the answers to the canonical NaN + // 0x7ff8000000000000, which causes compareResultF to accept any NaN + // encoding. We also use the same value as the input NaN in tests that have + // one, so that even in EXPECT_EXACT_RESULTS mode these tests should pass, + // because 0x7ff8000000000000 is still the exact expected NaN. + status |= + test__divdf3(0x0000000000000000, 0x0000000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x0000000000000000, 0x8000000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0xfff0000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x8000000000000000, 0x0000000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x8000000000000000, 0x8000000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0xfff0000000000000, 0xfff0000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x3ff0000000000000, 0x7ff8000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x7ff8000000000000, 0x3ff0000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000); + +#ifdef ARM_NAN_HANDLING + // Tests specific to the NaN handling of Arm hardware, 
mimicked by + // arm/divdf3.S: + // + // - a quiet NaN is distinguished by the top mantissa bit being 1 + // + // - if a signalling NaN appears in the input, the output quiet NaN is + // obtained by setting its top mantissa bit and leaving everything else + // unchanged + // + // - if both operands are signalling NaNs then the output NaN is derived + // from the first operand + // + // - if both operands are quiet NaNs then the output NaN is the first + // operand + // + // - invalid operations not involving an input NaN return the quiet + // NaN with fewest bits set, 0x7ff8000000000000. + status |= + test__divdf3(0x0000000000000000, 0x7ff3758244400801, 0x7ffb758244400801); + status |= + test__divdf3(0x0000000000000000, 0x7fff44d3f65148af, 0x7fff44d3f65148af); + status |= + test__divdf3(0x0000000000000001, 0x7ff48607b4b37057, 0x7ffc8607b4b37057); + status |= + test__divdf3(0x0000000000000001, 0x7ff855f2d435b33d, 0x7ff855f2d435b33d); + status |= + test__divdf3(0x000fffffffffffff, 0x7ff169269a674e13, 0x7ff969269a674e13); + status |= + test__divdf3(0x000fffffffffffff, 0x7ffc80978b2ef0da, 0x7ffc80978b2ef0da); + status |= + test__divdf3(0x3ff0000000000000, 0x7ff3458ad034593d, 0x7ffb458ad034593d); + status |= + test__divdf3(0x3ff0000000000000, 0x7ffdd8bb98c9f13a, 0x7ffdd8bb98c9f13a); + status |= + test__divdf3(0x7fefffffffffffff, 0x7ff79a8b96250a98, 0x7fff9a8b96250a98); + status |= + test__divdf3(0x7fefffffffffffff, 0x7ffdcc675b63bb94, 0x7ffdcc675b63bb94); + status |= + test__divdf3(0x7ff0000000000000, 0x7ff018cfaf4d0fff, 0x7ff818cfaf4d0fff); + status |= + test__divdf3(0x7ff0000000000000, 0x7ff83ad1ab4dfd24, 0x7ff83ad1ab4dfd24); + status |= + test__divdf3(0x7ff48ce6c0cdd5ac, 0x0000000000000000, 0x7ffc8ce6c0cdd5ac); + status |= + test__divdf3(0x7ff08a34f3d5385b, 0x0000000000000001, 0x7ff88a34f3d5385b); + status |= + test__divdf3(0x7ff0a264c1c96281, 0x000fffffffffffff, 0x7ff8a264c1c96281); + status |= + test__divdf3(0x7ff77ce629e61f0e, 0x3ff0000000000000, 
0x7fff7ce629e61f0e); + status |= + test__divdf3(0x7ff715e2d147fd76, 0x7fefffffffffffff, 0x7fff15e2d147fd76); + status |= + test__divdf3(0x7ff689a2031f1781, 0x7ff0000000000000, 0x7ffe89a2031f1781); + status |= + test__divdf3(0x7ff5dfb4a0c8cd05, 0x7ff11c1fe9793a33, 0x7ffddfb4a0c8cd05); + status |= + test__divdf3(0x7ff5826283ffb5d7, 0x7fff609b83884e81, 0x7ffd826283ffb5d7); + status |= + test__divdf3(0x7ff7cb03f2e61d42, 0x8000000000000000, 0x7fffcb03f2e61d42); + status |= + test__divdf3(0x7ff2adc8dfe72c96, 0x8000000000000001, 0x7ffaadc8dfe72c96); + status |= + test__divdf3(0x7ff4fc0bacc707f2, 0x800fffffffffffff, 0x7ffcfc0bacc707f2); + status |= + test__divdf3(0x7ff76248c8c9a619, 0xbff0000000000000, 0x7fff6248c8c9a619); + status |= + test__divdf3(0x7ff367972fce131b, 0xffefffffffffffff, 0x7ffb67972fce131b); + status |= + test__divdf3(0x7ff188f5ac284e92, 0xfff0000000000000, 0x7ff988f5ac284e92); + status |= + test__divdf3(0x7ffed4c22e4e569d, 0x0000000000000000, 0x7ffed4c22e4e569d); + status |= + test__divdf3(0x7ffe95105fa3f339, 0x0000000000000001, 0x7ffe95105fa3f339); + status |= + test__divdf3(0x7ffb8d33dbb9ecfb, 0x000fffffffffffff, 0x7ffb8d33dbb9ecfb); + status |= + test__divdf3(0x7ff874e41dc63e07, 0x3ff0000000000000, 0x7ff874e41dc63e07); + status |= + test__divdf3(0x7ffe27594515ecdf, 0x7fefffffffffffff, 0x7ffe27594515ecdf); + status |= + test__divdf3(0x7ffeac86d5c69bdf, 0x7ff0000000000000, 0x7ffeac86d5c69bdf); + status |= + test__divdf3(0x7ff97d657b99f76f, 0x7ff7e4149862a796, 0x7fffe4149862a796); + status |= + test__divdf3(0x7ffad17c6aa33fad, 0x7ffd898893ad4d28, 0x7ffad17c6aa33fad); + status |= + test__divdf3(0x7ff96e04e9c3d173, 0x8000000000000000, 0x7ff96e04e9c3d173); + status |= + test__divdf3(0x7ffec01ad8da3abb, 0x8000000000000001, 0x7ffec01ad8da3abb); + status |= + test__divdf3(0x7ffd1d565c495941, 0x800fffffffffffff, 0x7ffd1d565c495941); + status |= + test__divdf3(0x7ffe3d24f1e474a7, 0xbff0000000000000, 0x7ffe3d24f1e474a7); + status |= + 
test__divdf3(0x7ffc206f2bb8c8ce, 0xffefffffffffffff, 0x7ffc206f2bb8c8ce); + status |= + test__divdf3(0x7ff93efdecfb7d3b, 0xfff0000000000000, 0x7ff93efdecfb7d3b); + status |= + test__divdf3(0x8000000000000000, 0x7ff2ee725d143ac5, 0x7ffaee725d143ac5); + status |= + test__divdf3(0x8000000000000000, 0x7ffbba26e5c5fe98, 0x7ffbba26e5c5fe98); + status |= + test__divdf3(0x8000000000000001, 0x7ff7818a1cd26df9, 0x7fff818a1cd26df9); + status |= + test__divdf3(0x8000000000000001, 0x7ffaee6cc63b5292, 0x7ffaee6cc63b5292); + status |= + test__divdf3(0x800fffffffffffff, 0x7ff401096edaf79d, 0x7ffc01096edaf79d); + status |= + test__divdf3(0x800fffffffffffff, 0x7ffbf1778c7a2e59, 0x7ffbf1778c7a2e59); + status |= + test__divdf3(0xbff0000000000000, 0x7ff2e8fb0201c496, 0x7ffae8fb0201c496); + status |= + test__divdf3(0xbff0000000000000, 0x7ffcb6a5adb2e154, 0x7ffcb6a5adb2e154); + status |= + test__divdf3(0xffefffffffffffff, 0x7ff1ea1bfc15d71d, 0x7ff9ea1bfc15d71d); + status |= + test__divdf3(0xffefffffffffffff, 0x7ffae0766e21efc0, 0x7ffae0766e21efc0); + status |= + test__divdf3(0xfff0000000000000, 0x7ff3b364cffbdfe6, 0x7ffbb364cffbdfe6); + status |= + test__divdf3(0xfff0000000000000, 0x7ffd0d3223334ae3, 0x7ffd0d3223334ae3); + +#endif // ARM_NAN_HANDLING + + return status; +} diff --git a/compiler-rt/test/builtins/Unit/muldf3new_test.c b/compiler-rt/test/builtins/Unit/muldf3new_test.c new file mode 100644 index 0000000000000..b8a5c64460696 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/muldf3new_test.c @@ -0,0 +1,832 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// RUN: %clang_builtins %s %librt -o %t && %run %t +// REQUIRES: librt_has_muldf3 + +#include "int_lib.h" +#include +#include + +#include "fp_test.h" + +// By default this test uses compareResultD to check the returned floats, which +// accepts any returned NaN if the expected result is the canonical NaN value +// 0x7ff8000000000000. For the Arm optimized FP implementation, which commits +// to a more detailed handling of NaNs, we tighten up the check and include +// some extra test cases specific to that NaN policy. +#if COMPILER_RT_ARM_OPTIMIZED_FP +# define EXPECT_EXACT_RESULTS +# define ARM_NAN_HANDLING +#endif + +// Returns: a * b +COMPILER_RT_ABI double __muldf3(double a, double b); + +int test__muldf3(uint64_t a_rep, uint64_t b_rep, uint64_t expected_rep, + int line) { + double a = fromRep64(a_rep), b = fromRep64(b_rep); + double x = __muldf3(a, b); +#ifdef EXPECT_EXACT_RESULTS + int ret = toRep64(x) != expected_rep; +#else + int ret = compareResultD(x, expected_rep); +#endif + + if (ret) { + printf("error at line %d: __muldf3(%016" PRIx64 ", %016" PRIx64 + ") = %016" PRIx64 ", expected %016" PRIx64 "\n", + line, a_rep, b_rep, toRep64(x), expected_rep); + } + return ret; +} + +#define test__muldf3(a, b, x) test__muldf3(a, b, x, __LINE__) + +int main(void) { + int status = 0; + + status |= + test__muldf3(0x0000000000000000, 0x0000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x000fffffffffffff, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x001fffffffffffff, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x3ff0000000000000, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x7fdfffffffffffff, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x8000000000000000, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x8000000000000002, 0x8000000000000000); + status |= + 
test__muldf3(0x0000000000000000, 0x800fffffffffffff, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x8010000000000001, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x8020000000000000, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000000, 0xc008000000000000, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000000, 0xffcfffffffffffff, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000000, 0xffe0000000000000, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000000, 0xffefffffffffffff, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000001, 0x0000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000001, 0x0000000000000001, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000001, 0x3fe0000000000000, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000001, 0x3fefffffffffffff, 0x0000000000000001); + status |= + test__muldf3(0x0000000000000001, 0x3ff0000000000000, 0x0000000000000001); + status |= + test__muldf3(0x0000000000000001, 0x4000000000000000, 0x0000000000000002); + status |= + test__muldf3(0x0000000000000001, 0x7ff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x0000000000000001, 0xbfefffffffffffff, 0x8000000000000001); + status |= + test__muldf3(0x0000000000000006, 0x3fe0000000000000, 0x0000000000000003); + status |= + test__muldf3(0x0000000000000006, 0xbfe0000000000000, 0x8000000000000003); + status |= + test__muldf3(0x0000000000000008, 0x3fc0000000000000, 0x0000000000000001); + status |= + test__muldf3(0x000ffffffffffff7, 0x8020000000000003, 0x8000000000000000); + status |= + test__muldf3(0x000ffffffffffff8, 0x3ff0000000000001, 0x000ffffffffffff9); + status |= + test__muldf3(0x000ffffffffffff8, 0x3ff0000000000008, 0x0010000000000000); + status |= + test__muldf3(0x000ffffffffffff8, 0xbff0000000000001, 0x800ffffffffffff9); + status |= + test__muldf3(0x000ffffffffffff8, 0xbff0000000000008, 
0x8010000000000000); + status |= + test__muldf3(0x000ffffffffffffc, 0x4000000000000000, 0x001ffffffffffff8); + status |= + test__muldf3(0x000ffffffffffffe, 0x3feffffffffffffc, 0x000ffffffffffffc); + status |= + test__muldf3(0x000ffffffffffffe, 0x3ff0000000000001, 0x000fffffffffffff); + status |= + test__muldf3(0x000ffffffffffffe, 0xbff0000000000001, 0x800fffffffffffff); + status |= + test__muldf3(0x000fffffffffffff, 0x000ffffffffffffe, 0x0000000000000000); + status |= + test__muldf3(0x000fffffffffffff, 0x3cb0000000000001, 0x0000000000000001); + status |= + test__muldf3(0x000fffffffffffff, 0x3fe0000000000001, 0x0008000000000000); + status |= + test__muldf3(0x000fffffffffffff, 0x3ff0000000000001, 0x0010000000000000); + status |= + test__muldf3(0x000fffffffffffff, 0x4000000000000000, 0x001ffffffffffffe); + status |= + test__muldf3(0x0010000000000000, 0x0000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x0010000000000000, 0x0010000000000000, 0x0000000000000000); + status |= + test__muldf3(0x0010000000000000, 0x3feffffffffffffe, 0x000fffffffffffff); + status |= + test__muldf3(0x0010000000000000, 0x7ff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x0010000000000000, 0x8010000000000000, 0x8000000000000000); + status |= + test__muldf3(0x0010000000000000, 0xc000000000000000, 0x8020000000000000); + status |= + test__muldf3(0x0010000000000001, 0x3feffffffffffffa, 0x000ffffffffffffe); + status |= + test__muldf3(0x0010000000000001, 0x3feffffffffffffe, 0x0010000000000000); + status |= + test__muldf3(0x0010000000000001, 0xc000000000000000, 0x8020000000000001); + status |= + test__muldf3(0x0010000000000002, 0x3feffffffffffffc, 0x0010000000000000); + status |= + test__muldf3(0x001ffffffffffff8, 0x3fe0000000000000, 0x000ffffffffffffc); + status |= + test__muldf3(0x001ffffffffffffe, 0x3fe0000000000000, 0x000fffffffffffff); + status |= + test__muldf3(0x001ffffffffffffe, 0xbfe0000000000000, 0x800fffffffffffff); + status |= + 
test__muldf3(0x001fffffffffffff, 0x3fe0000000000000, 0x0010000000000000); + status |= + test__muldf3(0x001fffffffffffff, 0xbfe0000000000000, 0x8010000000000000); + status |= + test__muldf3(0x3fe0000000000000, 0x8000000000000001, 0x8000000000000000); + status |= + test__muldf3(0x3ff0000000000000, 0x000ffffffffffffd, 0x000ffffffffffffd); + status |= + test__muldf3(0x3ff0000000000000, 0x0020000000000003, 0x0020000000000003); + status |= + test__muldf3(0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000); + status |= + test__muldf3(0x3ff0000000000000, 0x4000000000000000, 0x4000000000000000); + status |= + test__muldf3(0x3ff0000000000000, 0x8000000000000001, 0x8000000000000001); + status |= + test__muldf3(0x3ff0000000000000, 0x8000000000000009, 0x8000000000000009); + status |= + test__muldf3(0x3ff0000000000001, 0x3ff0000000000001, 0x3ff0000000000002); + status |= + test__muldf3(0x3ff0000000000001, 0xbff0000000000001, 0xbff0000000000002); + status |= + test__muldf3(0x3ff0000000000001, 0xbff0000000000002, 0xbff0000000000003); + status |= + test__muldf3(0x3ff0000000000002, 0x3ff0000000000001, 0x3ff0000000000003); + status |= + test__muldf3(0x3ff0000000000002, 0x7feffffffffffffe, 0x7ff0000000000000); + status |= + test__muldf3(0x3ff0000000000001, 0x7feffffffffffffe, 0x7ff0000000000000); + status |= + test__muldf3(0x4000000000000000, 0x0010000000000000, 0x0020000000000000); + status |= + test__muldf3(0x4000000000000000, 0x0010000000000001, 0x0020000000000001); + status |= + test__muldf3(0x4000000000000000, 0x3ff0000000000000, 0x4000000000000000); + status |= + test__muldf3(0x4000000000000000, 0x4008000000000000, 0x4018000000000000); + status |= + test__muldf3(0x4000000000000000, 0x7fd0000000000000, 0x7fe0000000000000); + status |= + test__muldf3(0x4000000000000000, 0x7fdfffffffffffff, 0x7fefffffffffffff); + status |= + test__muldf3(0x4000000000000000, 0x800ffffffffffffd, 0x801ffffffffffffa); + status |= + test__muldf3(0x4000000000000000, 0x8010000000000003, 
0x8020000000000003); + status |= + test__muldf3(0x4000000000000000, 0x8010000000000005, 0x8020000000000005); + status |= + test__muldf3(0x4000000000000000, 0xbff0000000000000, 0xc000000000000000); + status |= + test__muldf3(0x4000000000000000, 0xffcffffffffffffd, 0xffdffffffffffffd); + status |= + test__muldf3(0x4000000000000000, 0xffd0000000000003, 0xffe0000000000003); + status |= + test__muldf3(0x4007ffffffffffff, 0x3feffffffffffffd, 0x4007fffffffffffd); + status |= + test__muldf3(0x4007ffffffffffff, 0x3feffffffffffffe, 0x4007fffffffffffe); + status |= + test__muldf3(0x4007ffffffffffff, 0x3fefffffffffffff, 0x4007fffffffffffe); + status |= + test__muldf3(0x4007ffffffffffff, 0xbfeffffffffffffd, 0xc007fffffffffffd); + status |= + test__muldf3(0x4008000000000000, 0x0000000000000002, 0x0000000000000006); + status |= + test__muldf3(0x4008000000000000, 0x4000000000000000, 0x4018000000000000); + status |= + test__muldf3(0x4008000000000000, 0x4008000000000000, 0x4022000000000000); + status |= + test__muldf3(0x4008000000000000, 0xc000000000000000, 0xc018000000000000); + status |= + test__muldf3(0x4008000000000001, 0x3ff0000000000001, 0x4008000000000003); + status |= + test__muldf3(0x4008000000000001, 0x3ff0000000000003, 0x4008000000000006); + status |= + test__muldf3(0x4008000000000001, 0xbff0000000000003, 0xc008000000000006); + status |= + test__muldf3(0x4010000000000000, 0x0000000000000002, 0x0000000000000008); + status |= + test__muldf3(0x4010000000000000, 0x7fcfffffffffffff, 0x7fefffffffffffff); + status |= + test__muldf3(0x4010000000000000, 0xffcfffffffffffff, 0xffefffffffffffff); + status |= + test__muldf3(0x4013ffffffffffff, 0x3fefffffffffffff, 0x4013fffffffffffe); + status |= + test__muldf3(0x4014000000000000, 0x0000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x4014000000000000, 0x7ff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x4014000000000001, 0x3ff0000000000001, 0x4014000000000002); + status |= + 
test__muldf3(0x401bffffffffffff, 0x3feffffffffffffc, 0x401bfffffffffffc); + status |= + test__muldf3(0x401bffffffffffff, 0x3fefffffffffffff, 0x401bfffffffffffe); + status |= + test__muldf3(0x401c000000000000, 0x8000000000000000, 0x8000000000000000); + status |= + test__muldf3(0x401c000000000000, 0xfff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x401c000000000001, 0x3ff0000000000001, 0x401c000000000003); + status |= + test__muldf3(0x7fcffffffffffffd, 0x4010000000000000, 0x7feffffffffffffd); + status |= + test__muldf3(0x7fcffffffffffffd, 0xc010000000000000, 0xffeffffffffffffd); + status |= + test__muldf3(0x7fd0000000000000, 0xc000000000000000, 0xffe0000000000000); + status |= + test__muldf3(0x7fdffffffffffffd, 0xc000000000000008, 0xfff0000000000000); + status |= + test__muldf3(0x7fdfffffffffffff, 0xc000000000000000, 0xffefffffffffffff); + status |= + test__muldf3(0x7fe0000000000000, 0x0000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x7fe0000000000000, 0x4000000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x7fe0000000000000, 0x7fe0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x7fe0000000000000, 0x7feffffffffffffe, 0x7ff0000000000000); + status |= + test__muldf3(0x7fe0000000000000, 0x7ff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x7fe0000000000000, 0xffd0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x7fe0000000000000, 0xffd0000000000004, 0xfff0000000000000); + status |= + test__muldf3(0x7fe0000000000000, 0xffe0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x7fe0000000000009, 0x7feffffffffffffa, 0x7ff0000000000000); + status |= + test__muldf3(0x7fe0000000000009, 0xc018000000000002, 0xfff0000000000000); + status |= + test__muldf3(0x7fefffffffffffff, 0x0000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x000fffffffffffff, 0x7ff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x001fffffffffffff, 
0x7ff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x3ff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x7fdfffffffffffff, 0x7ff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x8000000000000002, 0xfff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x800fffffffffffff, 0xfff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x8010000000000001, 0xfff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x8020000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0xc008000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0xffe0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0xffefffffffffffff, 0xfff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0xfff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x8000000000000000, 0x0000000000000000, 0x8000000000000000); + status |= + test__muldf3(0x8000000000000000, 0x4018000000000000, 0x8000000000000000); + status |= + test__muldf3(0x8000000000000000, 0x7fefffffffffffff, 0x8000000000000000); + status |= + test__muldf3(0x8000000000000000, 0x8000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x8000000000000000, 0x8000000000000004, 0x0000000000000000); + status |= + test__muldf3(0x8000000000000000, 0x8010000000000000, 0x0000000000000000); + status |= + test__muldf3(0x8000000000000000, 0xc020000000000000, 0x0000000000000000); + status |= + test__muldf3(0x8000000000000000, 0xffd0000000000000, 0x0000000000000000); + status |= + test__muldf3(0x8000000000000001, 0x0000000000000001, 0x8000000000000000); + status |= + test__muldf3(0x8000000000000001, 0x4014000000000000, 0x8000000000000005); + status |= + test__muldf3(0x8000000000000002, 0x3ff0000000000000, 0x8000000000000002); + status |= + 
test__muldf3(0x8000000000000003, 0x0000000000000000, 0x8000000000000000); + status |= + test__muldf3(0x8000000000000003, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x8000000000000004, 0xbff0000000000000, 0x0000000000000004); + status |= + test__muldf3(0x8000000000000008, 0x3fc0000000000000, 0x8000000000000001); + status |= + test__muldf3(0x800ffffffffffff7, 0x0020000000000003, 0x8000000000000000); + status |= + test__muldf3(0x800ffffffffffff7, 0x3ff0000000000001, 0x800ffffffffffff8); + status |= + test__muldf3(0x800ffffffffffffd, 0xc000000000000000, 0x001ffffffffffffa); + status |= + test__muldf3(0x800fffffffffffff, 0x0000000000000000, 0x8000000000000000); + status |= + test__muldf3(0x800fffffffffffff, 0x3ff0000000000001, 0x8010000000000000); + status |= + test__muldf3(0x800fffffffffffff, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x800fffffffffffff, 0x8000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x800fffffffffffff, 0x800ffffffffffffe, 0x0000000000000000); + status |= + test__muldf3(0x800fffffffffffff, 0xbff0000000000000, 0x000fffffffffffff); + status |= + test__muldf3(0x800fffffffffffff, 0xfff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x8010000000000000, 0x0010000000000000, 0x8000000000000000); + status |= + test__muldf3(0x8010000000000000, 0x8010000000000000, 0x0000000000000000); + status |= + test__muldf3(0x8010000000000001, 0x0000000000000000, 0x8000000000000000); + status |= + test__muldf3(0x8010000000000001, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x8010000000000001, 0xbff0000000000000, 0x0010000000000001); + status |= + test__muldf3(0x801ffffffffffffc, 0x3fe0000000000000, 0x800ffffffffffffe); + status |= + test__muldf3(0x801ffffffffffffc, 0xbfe0000000000000, 0x000ffffffffffffe); + status |= + test__muldf3(0x801ffffffffffffe, 0x3ff0000000000000, 0x801ffffffffffffe); + status |= + test__muldf3(0x801fffffffffffff, 0x8000000000000000, 
0x0000000000000000); + status |= + test__muldf3(0x801fffffffffffff, 0xfff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x8020000000000000, 0x0000000000000000, 0x8000000000000000); + status |= + test__muldf3(0x8020000000000000, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xbfefffffffffffff, 0xffefffffffffffff, 0x7feffffffffffffe); + status |= + test__muldf3(0xbff0000000000000, 0x0000000000000009, 0x8000000000000009); + status |= + test__muldf3(0xbff0000000000000, 0x0010000000000009, 0x8010000000000009); + status |= + test__muldf3(0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000); + status |= + test__muldf3(0xbff0000000000000, 0x4000000000000000, 0xc000000000000000); + status |= + test__muldf3(0xbff0000000000000, 0xbff0000000000000, 0x3ff0000000000000); + status |= + test__muldf3(0xbff0000000000000, 0xc000000000000000, 0x4000000000000000); + status |= + test__muldf3(0xbff0000000000001, 0x3ff0000000000001, 0xbff0000000000002); + status |= + test__muldf3(0xbff0000000000001, 0xbff0000000000001, 0x3ff0000000000002); + status |= + test__muldf3(0xbff0000000000001, 0xbff0000000000002, 0x3ff0000000000003); + status |= + test__muldf3(0xbff0000000000002, 0x3ff0000000000001, 0xbff0000000000003); + status |= + test__muldf3(0xbff0000000000002, 0xbff0000000000001, 0x3ff0000000000003); + status |= + test__muldf3(0xc000000000000000, 0x0000000000000000, 0x8000000000000000); + status |= + test__muldf3(0xc000000000000000, 0x000ffffffffffffd, 0x801ffffffffffffa); + status |= + test__muldf3(0xc000000000000000, 0x0010000000000001, 0x8020000000000001); + status |= + test__muldf3(0xc000000000000000, 0x0010000000000005, 0x8020000000000005); + status |= + test__muldf3(0xc000000000000000, 0x0010000000000009, 0x8020000000000009); + status |= + test__muldf3(0xc000000000000000, 0x4008000000000000, 0xc018000000000000); + status |= + test__muldf3(0xc000000000000000, 0x7fcfffffffffffff, 0xffdfffffffffffff); + status |= + 
test__muldf3(0xc000000000000000, 0x7fd0000000000001, 0xffe0000000000001); + status |= + test__muldf3(0xc000000000000000, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xc000000000000000, 0xbff0000000000000, 0x4000000000000000); + status |= + test__muldf3(0xc000000000000000, 0xc008000000000000, 0x4018000000000000); + status |= + test__muldf3(0xc007fffffffffffe, 0x7fe0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xc007ffffffffffff, 0x3fefffffffffffff, 0xc007fffffffffffe); + status |= + test__muldf3(0xc008000000000000, 0x4008000000000000, 0xc022000000000000); + status |= + test__muldf3(0xc008000000000000, 0xc000000000000000, 0x4018000000000000); + status |= + test__muldf3(0xc008000000000000, 0xc008000000000000, 0x4022000000000000); + status |= + test__muldf3(0xc008000000000000, 0xffe0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0xc008000000000001, 0x3ff0000000000001, 0xc008000000000003); + status |= + test__muldf3(0xc010000000000000, 0x7fcfffffffffffff, 0xffefffffffffffff); + status |= + test__muldf3(0xc010000000000000, 0x8000000000000000, 0x0000000000000000); + status |= + test__muldf3(0xc010000000000000, 0xffcfffffffffffff, 0x7fefffffffffffff); + status |= + test__muldf3(0xc010000000000000, 0xfff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0xc013fffffffffffe, 0xffe0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0xc013ffffffffffff, 0xbfefffffffffffff, 0x4013fffffffffffe); + status |= + test__muldf3(0xc014000000000001, 0xbff0000000000001, 0x4014000000000002); + status |= + test__muldf3(0xc01bfffffffffff9, 0x7fe0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xc022000000000000, 0x7fe0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xc022000000000001, 0xffe0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0xffcffffffffffff9, 0x7fe0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xffcffffffffffff9, 0xc00fffffffffffff, 
0x7feffffffffffff8); + status |= + test__muldf3(0xffcffffffffffffd, 0x4010000000000000, 0xffeffffffffffffd); + status |= + test__muldf3(0xffcffffffffffffd, 0xc010000000000000, 0x7feffffffffffffd); + status |= + test__muldf3(0xffcfffffffffffff, 0x0000000000000000, 0x8000000000000000); + status |= + test__muldf3(0xffcfffffffffffff, 0x4000000000000001, 0xffe0000000000000); + status |= + test__muldf3(0xffcfffffffffffff, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xffd0000000000000, 0x0000000000000000, 0x8000000000000000); + status |= + test__muldf3(0xffd0000000000000, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xffdffffffffffff7, 0x7fd0000000000001, 0xfff0000000000000); + status |= + test__muldf3(0xffdfffffffffffff, 0x3ff0000000000001, 0xffe0000000000000); + status |= + test__muldf3(0xffdfffffffffffff, 0x8000000000000000, 0x0000000000000000); + status |= + test__muldf3(0xffe0000000000005, 0xffe0000000000001, 0x7ff0000000000000); + status |= + test__muldf3(0xffeffffffffffffd, 0x7fe0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xffeffffffffffffd, 0xc008000000000001, 0x7ff0000000000000); + status |= + test__muldf3(0xffeffffffffffffd, 0xffe0000000000001, 0x7ff0000000000000); + status |= + test__muldf3(0xffefffffffffffff, 0x8000000000000000, 0x0000000000000000); + status |= + test__muldf3(0xffefffffffffffff, 0xffefffffffffffff, 0x7ff0000000000000); + status |= + test__muldf3(0xffefffffffffffff, 0xfff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0xfff0000000000000, 0x4018000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xfff0000000000000, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xfff0000000000000, 0x8000000000000004, 0x7ff0000000000000); + status |= + test__muldf3(0xfff0000000000000, 0x8010000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0xfff0000000000000, 0xc020000000000000, 0x7ff0000000000000); + status |= + 
test__muldf3(0xfff0000000000000, 0xffd0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0xfff0000000000000, 0xfff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x002ffffffe000000, 0x3fcffffffffffffd, 0x000ffffffeffffff); + status |= + test__muldf3(0xbfeffeffffffffff, 0x8010000000000100, 0x000fff80000000ff); + status |= + test__muldf3(0x802ffffffe000000, 0x3fcffffffffffffd, 0x800ffffffeffffff); + status |= + test__muldf3(0xbfeffeffffffffff, 0x0010000000000100, 0x800fff80000000ff); + status |= + test__muldf3(0xbf9e8325a5aa6c8d, 0xbf9e8325a5aa6c8d, 0x3f4d180013083955); + status |= + test__muldf3(0x3ffd25d7ea4fa2d4, 0x3fe4000000000000, 0x3ff237a6f271c5c4); + status |= + test__muldf3(0x6ffd25d7ea4fa2d4, 0x4fe4000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x201d25d7ea4fa2d4, 0x1fd4000000000000, 0x00091bd37938e2e2); + status |= + test__muldf3(0x3ffd25d7ea4fa2d4, 0x3fe8000000000000, 0x3ff5dc61efbbba1f); + status |= + test__muldf3(0x6ffd25d7ea4fa2d4, 0x4fe8000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x201d25d7ea4fa2d4, 0x1fd8000000000000, 0x000aee30f7dddd10); + status |= + test__muldf3(0x3ffd25d7ea4fa2d4, 0x3fec000000000000, 0x3ff9811ced05ae7a); + status |= + test__muldf3(0x6ffd25d7ea4fa2d4, 0x4fec000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x201d25d7ea4fa2d4, 0x1fdc000000000000, 0x000cc08e7682d73d); + status |= + test__muldf3(0x3ff265f139b6c87c, 0x3ff7000000000000, 0x3ffa728ac2f6c032); + status |= + test__muldf3(0x6ff265f139b6c87c, 0x4ff7000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x201265f139b6c87c, 0x1fe7000000000000, 0x000d3945617b6019); + status |= + test__muldf3(0x3ff265f139b6c87c, 0x3ff5000000000000, 0x3ff825cc9bbfe723); + status |= + test__muldf3(0x6ff265f139b6c87c, 0x4ff5000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x201265f139b6c87c, 0x1fe5000000000000, 0x000c12e64ddff391); + status |= + test__muldf3(0x3ffe5ab1dc9f12f9, 0x3ff0c1a10c80f0b7, 
0x3fffca09666ab16e); + status |= + test__muldf3(0x6ffe5ab1dc9f12f9, 0x4ff0c1a10c80f0b7, 0x7ff0000000000000); + status |= + test__muldf3(0x201e5ab1dc9f12f9, 0x1fe0c1a10c80f0b7, 0x000fe504b33558b7); + status |= + test__muldf3(0x3ffe5ab1dc9f12f9, 0x3fe73e5ef37f0f49, 0x3ff60c59a0917f00); + status |= + test__muldf3(0x6ffe5ab1dc9f12f9, 0x4fe73e5ef37f0f49, 0x7ff0000000000000); + status |= + test__muldf3(0x201e5ab1dc9f12f9, 0x1fd73e5ef37f0f49, 0x000b062cd048bf80); + status |= + test__muldf3(0x3ffe5ab1dc9f12f9, 0x3fe8c1a10c80f0b7, 0x3ff77bb12a5d1d75); + status |= + test__muldf3(0x6ffe5ab1dc9f12f9, 0x4fe8c1a10c80f0b7, 0x7ff0000000000000); + status |= + test__muldf3(0x201e5ab1dc9f12f9, 0x1fd8c1a10c80f0b7, 0x000bbdd8952e8ebb); + status |= + test__muldf3(0x3ffc6be665de3b1d, 0x3fe52d156619a0cb, 0x3ff2ced9f056fba8); + status |= + test__muldf3(0x6ffc6be665de3b1d, 0x4fe52d156619a0cb, 0x7ff0000000000000); + status |= + test__muldf3(0x201c6be665de3b1d, 0x1fd52d156619a0cb, 0x0009676cf82b7dd4); + status |= + test__muldf3(0x3ffc6be665de3b1d, 0x3fead2ea99e65f35, 0x3ff7d2ffa8765d03); + status |= + test__muldf3(0x6ffc6be665de3b1d, 0x4fead2ea99e65f35, 0x7ff0000000000000); + status |= + test__muldf3(0x201c6be665de3b1d, 0x1fdad2ea99e65f35, 0x000be97fd43b2e82); + status |= + test__muldf3(0x3ff1c0635d3cd39d, 0x3ff5c9b956d0b54b, 0x3ff82c50eb71ac34); + status |= + test__muldf3(0x6ff1c0635d3cd39d, 0x4ff5c9b956d0b54b, 0x7ff0000000000000); + status |= + test__muldf3(0x2011c0635d3cd39d, 0x1fe5c9b956d0b54b, 0x000c162875b8d61a); + status |= + test__muldf3(0x3ff1c0635d3cd39d, 0x3ff23646a92f4ab5, 0x3ff434a77da664d4); + status |= + test__muldf3(0x6ff1c0635d3cd39d, 0x4ff23646a92f4ab5, 0x7ff0000000000000); + status |= + test__muldf3(0x2011c0635d3cd39d, 0x1fe23646a92f4ab5, 0x000a1a53bed3326a); + status |= + test__muldf3(0x3ff1c0635d3cd39d, 0x3ffa3646a92f4ab5, 0x3ffd14d92c44cea3); + status |= + test__muldf3(0x6ff1c0635d3cd39d, 0x4ffa3646a92f4ab5, 0x7ff0000000000000); + status |= + 
test__muldf3(0x2011c0635d3cd39d, 0x1fea3646a92f4ab5, 0x000e8a6c96226751); + status |= + test__muldf3(0x3ff1c0635d3cd39d, 0x3ff1c9b956d0b54b, 0x3ff3bc381422774d); + status |= + test__muldf3(0x6ff1c0635d3cd39d, 0x4ff1c9b956d0b54b, 0x7ff0000000000000); + status |= + test__muldf3(0x2011c0635d3cd39d, 0x1fe1c9b956d0b54b, 0x0009de1c0a113ba6); + status |= + test__muldf3(0x3ff907065fd11389, 0x3fe46bad37af52b9, 0x3feff135e5756ec7); + status |= + test__muldf3(0x6ff907065fd11389, 0x4fe46bad37af52b9, 0x7feff135e5756ec7); + status |= + test__muldf3(0x201907065fd11389, 0x1fd46bad37af52b9, 0x0007fc4d795d5bb2); + status |= + test__muldf3(0x3ff907065fd11389, 0x3feb9452c850ad47, 0x3ff591ee9cfee5ea); + status |= + test__muldf3(0x6ff907065fd11389, 0x4feb9452c850ad47, 0x7ff0000000000000); + status |= + test__muldf3(0x201907065fd11389, 0x1fdb9452c850ad47, 0x000ac8f74e7f72f5); + status |= + test__muldf3(0x3ff761c03e198df7, 0x3fe7f47c731d43c7, 0x3ff180e675617e83); + status |= + test__muldf3(0x6ff761c03e198df7, 0x4fe7f47c731d43c7, 0x7ff0000000000000); + status |= + test__muldf3(0x201761c03e198df7, 0x1fd7f47c731d43c7, 0x0008c0733ab0bf41); + status |= + test__muldf3(0x3ffce6d1246c46fb, 0x3ff0b3469ded2bcd, 0x3ffe2aa6f74c0ffd); + status |= + test__muldf3(0x6ffce6d1246c46fb, 0x4ff0b3469ded2bcd, 0x7ff0000000000000); + status |= + test__muldf3(0x201ce6d1246c46fb, 0x1fe0b3469ded2bcd, 0x000f15537ba607fe); + status |= + test__muldf3(0x3ffd5701100ec79d, 0x3fee654fee13094b, 0x3ffbde74e37bb583); + status |= + test__muldf3(0x6ffd5701100ec79d, 0x4fee654fee13094b, 0x7ff0000000000000); + status |= + test__muldf3(0x201d5701100ec79d, 0x1fde654fee13094b, 0x000def3a71bddac1); + status |= + test__muldf3(0x3ffce1a06e8bcfd3, 0x3ff01c54436a605b, 0x3ffd14c361885d61); + status |= + test__muldf3(0x6ffce1a06e8bcfd3, 0x4ff01c54436a605b, 0x7ff0000000000000); + status |= + test__muldf3(0x201ce1a06e8bcfd3, 0x1fe01c54436a605b, 0x000e8a61b0c42eb0); + status |= + test__muldf3(0x3ff21d1a5ca518a5, 0x3ff29f0ce1150f2d, 
0x3ff514cd72d743f2); + status |= + test__muldf3(0x6ff21d1a5ca518a5, 0x4ff29f0ce1150f2d, 0x7ff0000000000000); + status |= + test__muldf3(0x20121d1a5ca518a5, 0x1fe29f0ce1150f2d, 0x000a8a66b96ba1f9); + status |= + test__muldf3(0x3ff031a98dbf97ba, 0x3ff4000000000000, 0x3ff43e13f12f7da8); + status |= + test__muldf3(0x6ff031a98dbf97ba, 0x4ff4000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x201031a98dbf97ba, 0x1fe4000000000000, 0x000a1f09f897bed4); + status |= + test__muldf3(0x0000000000000003, 0xc00fffffffffffff, 0x800000000000000c); + status |= + test__muldf3(0x0000000000000003, 0x400fffffffffffff, 0x000000000000000c); + status |= + test__muldf3(0x8000000000000003, 0xc00fffffffffffff, 0x000000000000000c); + status |= + test__muldf3(0x8000000000000003, 0x400fffffffffffff, 0x800000000000000c); + status |= + test__muldf3(0x0000000000000003, 0xc00ffffffffffffd, 0x800000000000000c); + status |= + test__muldf3(0x0000000000000003, 0x400ffffffffffffd, 0x000000000000000c); + status |= + test__muldf3(0x8000000000000003, 0xc00ffffffffffffd, 0x000000000000000c); + status |= + test__muldf3(0x8000000000000003, 0x400ffffffffffffd, 0x800000000000000c); + status |= + test__muldf3(0x1e51f703ee090000, 0x1e5c8000e4000000, 0x0000000000000001); + status |= + test__muldf3(0x1e561ed9745fdb21, 0x1e57255ca25b68e1, 0x0000000000000001); + status |= + test__muldf3(0x7feffffffff00000, 0xc000000000080000, 0xfff0000000000000); + + // Test that the result of an operation is a NaN at all when it should be. + // + // In most configurations these tests' results are checked compared using + // compareResultD, so we set all the answers to the canonical NaN + // 0x7ff8000000000000, which causes compareResultF to accept any NaN + // encoding. We also use the same value as the input NaN in tests that have + // one, so that even in EXPECT_EXACT_RESULTS mode these tests should pass, + // because 0x7ff8000000000000 is still the exact expected NaN. 
+ status |= + test__muldf3(0x7ff0000000000000, 0x0000000000000000, 0x7ff8000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x8000000000000000, 0x7ff8000000000000); + status |= + test__muldf3(0x8000000000000000, 0x7ff0000000000000, 0x7ff8000000000000); + status |= + test__muldf3(0x8000000000000000, 0xfff0000000000000, 0x7ff8000000000000); + status |= + test__muldf3(0x3ff0000000000000, 0x7ff8000000000000, 0x7ff8000000000000); + status |= + test__muldf3(0x7ff8000000000000, 0x3ff0000000000000, 0x7ff8000000000000); + status |= + test__muldf3(0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000); + +#ifdef ARM_NAN_HANDLING + // Tests specific to the NaN handling of Arm hardware, mimicked by + // arm/muldf3.S: + // + // - a quiet NaN is distinguished by the top mantissa bit being 1 + // + // - if a signalling NaN appears in the input, the output quiet NaN is + // obtained by setting its top mantissa bit and leaving everything else + // unchanged + // + // - if both operands are signalling NaNs then the output NaN is derived + // from the first operand + // + // - if both operands are quiet NaNs then the output NaN is the first + // operand + // + // - invalid operations not involving an input NaN return the quiet + // NaN with fewest bits set, 0x7ff8000000000000. 
+ status |= + test__muldf3(0x0000000000000000, 0x7ff3758244400801, 0x7ffb758244400801); + status |= + test__muldf3(0x0000000000000000, 0x7fff44d3f65148af, 0x7fff44d3f65148af); + status |= + test__muldf3(0x0000000000000001, 0x7ff48607b4b37057, 0x7ffc8607b4b37057); + status |= + test__muldf3(0x0000000000000001, 0x7ff855f2d435b33d, 0x7ff855f2d435b33d); + status |= + test__muldf3(0x000fffffffffffff, 0x7ff169269a674e13, 0x7ff969269a674e13); + status |= + test__muldf3(0x000fffffffffffff, 0x7ffc80978b2ef0da, 0x7ffc80978b2ef0da); + status |= + test__muldf3(0x3ff0000000000000, 0x7ff3458ad034593d, 0x7ffb458ad034593d); + status |= + test__muldf3(0x3ff0000000000000, 0x7ffdd8bb98c9f13a, 0x7ffdd8bb98c9f13a); + status |= + test__muldf3(0x7fefffffffffffff, 0x7ff79a8b96250a98, 0x7fff9a8b96250a98); + status |= + test__muldf3(0x7fefffffffffffff, 0x7ffdcc675b63bb94, 0x7ffdcc675b63bb94); + status |= + test__muldf3(0x7ff0000000000000, 0x7ff018cfaf4d0fff, 0x7ff818cfaf4d0fff); + status |= + test__muldf3(0x7ff0000000000000, 0x7ff83ad1ab4dfd24, 0x7ff83ad1ab4dfd24); + status |= + test__muldf3(0x7ff48ce6c0cdd5ac, 0x0000000000000000, 0x7ffc8ce6c0cdd5ac); + status |= + test__muldf3(0x7ff08a34f3d5385b, 0x0000000000000001, 0x7ff88a34f3d5385b); + status |= + test__muldf3(0x7ff0a264c1c96281, 0x000fffffffffffff, 0x7ff8a264c1c96281); + status |= + test__muldf3(0x7ff77ce629e61f0e, 0x3ff0000000000000, 0x7fff7ce629e61f0e); + status |= + test__muldf3(0x7ff715e2d147fd76, 0x7fefffffffffffff, 0x7fff15e2d147fd76); + status |= + test__muldf3(0x7ff689a2031f1781, 0x7ff0000000000000, 0x7ffe89a2031f1781); + status |= + test__muldf3(0x7ff5dfb4a0c8cd05, 0x7ff11c1fe9793a33, 0x7ffddfb4a0c8cd05); + status |= + test__muldf3(0x7ff5826283ffb5d7, 0x7fff609b83884e81, 0x7ffd826283ffb5d7); + status |= + test__muldf3(0x7ff7cb03f2e61d42, 0x8000000000000000, 0x7fffcb03f2e61d42); + status |= + test__muldf3(0x7ff2adc8dfe72c96, 0x8000000000000001, 0x7ffaadc8dfe72c96); + status |= + test__muldf3(0x7ff4fc0bacc707f2, 
0x800fffffffffffff, 0x7ffcfc0bacc707f2); + status |= + test__muldf3(0x7ff76248c8c9a619, 0xbff0000000000000, 0x7fff6248c8c9a619); + status |= + test__muldf3(0x7ff367972fce131b, 0xffefffffffffffff, 0x7ffb67972fce131b); + status |= + test__muldf3(0x7ff188f5ac284e92, 0xfff0000000000000, 0x7ff988f5ac284e92); + status |= + test__muldf3(0x7ffed4c22e4e569d, 0x0000000000000000, 0x7ffed4c22e4e569d); + status |= + test__muldf3(0x7ffe95105fa3f339, 0x0000000000000001, 0x7ffe95105fa3f339); + status |= + test__muldf3(0x7ffb8d33dbb9ecfb, 0x000fffffffffffff, 0x7ffb8d33dbb9ecfb); + status |= + test__muldf3(0x7ff874e41dc63e07, 0x3ff0000000000000, 0x7ff874e41dc63e07); + status |= + test__muldf3(0x7ffe27594515ecdf, 0x7fefffffffffffff, 0x7ffe27594515ecdf); + status |= + test__muldf3(0x7ffeac86d5c69bdf, 0x7ff0000000000000, 0x7ffeac86d5c69bdf); + status |= + test__muldf3(0x7ff97d657b99f76f, 0x7ff7e4149862a796, 0x7fffe4149862a796); + status |= + test__muldf3(0x7ffad17c6aa33fad, 0x7ffd898893ad4d28, 0x7ffad17c6aa33fad); + status |= + test__muldf3(0x7ff96e04e9c3d173, 0x8000000000000000, 0x7ff96e04e9c3d173); + status |= + test__muldf3(0x7ffec01ad8da3abb, 0x8000000000000001, 0x7ffec01ad8da3abb); + status |= + test__muldf3(0x7ffd1d565c495941, 0x800fffffffffffff, 0x7ffd1d565c495941); + status |= + test__muldf3(0x7ffe3d24f1e474a7, 0xbff0000000000000, 0x7ffe3d24f1e474a7); + status |= + test__muldf3(0x7ffc206f2bb8c8ce, 0xffefffffffffffff, 0x7ffc206f2bb8c8ce); + status |= + test__muldf3(0x7ff93efdecfb7d3b, 0xfff0000000000000, 0x7ff93efdecfb7d3b); + status |= + test__muldf3(0x8000000000000000, 0x7ff2ee725d143ac5, 0x7ffaee725d143ac5); + status |= + test__muldf3(0x8000000000000000, 0x7ffbba26e5c5fe98, 0x7ffbba26e5c5fe98); + status |= + test__muldf3(0x8000000000000001, 0x7ff7818a1cd26df9, 0x7fff818a1cd26df9); + status |= + test__muldf3(0x8000000000000001, 0x7ffaee6cc63b5292, 0x7ffaee6cc63b5292); + status |= + test__muldf3(0x800fffffffffffff, 0x7ff401096edaf79d, 0x7ffc01096edaf79d); + status |= + 
test__muldf3(0x800fffffffffffff, 0x7ffbf1778c7a2e59, 0x7ffbf1778c7a2e59); + status |= + test__muldf3(0xbff0000000000000, 0x7ff2e8fb0201c496, 0x7ffae8fb0201c496); + status |= + test__muldf3(0xbff0000000000000, 0x7ffcb6a5adb2e154, 0x7ffcb6a5adb2e154); + status |= + test__muldf3(0xffefffffffffffff, 0x7ff1ea1bfc15d71d, 0x7ff9ea1bfc15d71d); + status |= + test__muldf3(0xffefffffffffffff, 0x7ffae0766e21efc0, 0x7ffae0766e21efc0); + status |= + test__muldf3(0xfff0000000000000, 0x7ff3b364cffbdfe6, 0x7ffbb364cffbdfe6); + status |= + test__muldf3(0xfff0000000000000, 0x7ffd0d3223334ae3, 0x7ffd0d3223334ae3); + +#endif // ARM_NAN_HANDLING + + return status; +} diff --git a/compiler-rt/test/profile/CMakeLists.txt b/compiler-rt/test/profile/CMakeLists.txt index a6d8a9684508d..213a05032ed80 100644 --- a/compiler-rt/test/profile/CMakeLists.txt +++ b/compiler-rt/test/profile/CMakeLists.txt @@ -22,6 +22,17 @@ pythonize_bool(LLVM_ENABLE_CURL) foreach(arch ${PROFILE_TEST_ARCH}) set(PROFILE_TEST_TARGET_ARCH ${arch}) get_test_cc_for_arch(${arch} PROFILE_TEST_TARGET_CC PROFILE_TEST_TARGET_CFLAGS) + # On MSVC, Profile-* tests must link with the same CRT model as the + # clang_rt.profile static archive they exercise. When that archive pulls + # in RTInterception / RTSanitizerCommon object libraries, those are built + # with MultiThreadedDLL (/MD), so the .objs reference __imp_* symbols; + # the test binary defaults to /MT and fails to link (LNK2019 __imp__stricmp + # from interception_win.cpp, LNK4098 default-lib conflicts). Match the + # DLL CRT here so test executables link against the same runtime. 
+ if(MSVC AND COMPILER_RT_HAS_INTERCEPTION AND NOT COMPILER_RT_PROFILE_BAREMETAL) + string(APPEND PROFILE_TEST_TARGET_CFLAGS + " -D_MT -D_DLL -Wl,-nodefaultlib:libcmt,-defaultlib:msvcrt,-defaultlib:oldnames") + endif() set(CONFIG_NAME Profile-${arch}) configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in diff --git a/flang-rt/lib/cuda/memory.cpp b/flang-rt/lib/cuda/memory.cpp index 575d7bbc9c29a..05302ee47e093 100644 --- a/flang-rt/lib/cuda/memory.cpp +++ b/flang-rt/lib/cuda/memory.cpp @@ -30,9 +30,9 @@ struct Memcpy2DLayout { std::size_t pitchBytes; }; -// Get cudaMemcpy2D layout information if both descriptors have equal element -// counts and regular positive-stride layouts. Returns a nullopt otherwise to -// fallback on the runtime assignment. +// Get cudaMemcpy2D layout information for a descriptor that can be represented +// as fixed-pitch rows of widthBytes. Returns nullopt for layouts that need the +// general runtime assignment path. static std::optional GetMemcpy2DLayout( const Descriptor &desc, std::size_t widthBytes) { if (desc.rank() == 0 || desc.Elements() == 0) { @@ -84,13 +84,63 @@ static std::optional GetMemcpy2DLayout( return layout; } +// Collect candidate row widths from the descriptor's leading contiguous +// dimensions, starting with one element. 
+static int GetContiguousLeadingBytes( + const Descriptor &desc, std::size_t *bytes) { + const auto elemBytes = desc.ElementBytes(); + if (elemBytes == 0) { + return 0; + } + + int count = 0; + bytes[count++] = elemBytes; + std::size_t contiguousBytes = elemBytes; + for (int j = 0; j < desc.rank(); ++j) { + const auto &dim = desc.GetDimension(j); + if (dim.Extent() != 1 && + (dim.ByteStride() < 0 || + static_cast(dim.ByteStride()) != contiguousBytes)) { + break; + } + contiguousBytes *= dim.Extent(); + if (contiguousBytes != bytes[count - 1]) { + bytes[count++] = contiguousBytes; + } + } + return count; +} + +// Choose the largest row width that is contiguous in both descriptors, so +// leading-dimension slices can be copied as wider cudaMemcpy2D rows. +static std::size_t GetMemcpy2DWidthBytes( + const Descriptor &dst, const Descriptor &src) { + std::size_t dstBytes[maxRank + 1]; + std::size_t srcBytes[maxRank + 1]; + const int dstCount = GetContiguousLeadingBytes(dst, dstBytes); + const int srcCount = GetContiguousLeadingBytes(src, srcBytes); + for (int j = dstCount - 1; j >= 0; --j) { + for (int k = srcCount - 1; k >= 0; --k) { + if (dstBytes[j] == srcBytes[k]) { + return dstBytes[j]; + } + } + } + return 0; +} + +// Try to use cudaMemcpy2D for a memcpy of two descriptors, returning true if +// successful. False if the 2D data transfer is not possible. 
static bool DoMemcpy2D(const Descriptor &dst, const Descriptor &src, cudaMemcpyKind kind, const char *sourceFile, int sourceLine) { if (dst.ElementBytes() != src.ElementBytes() || dst.Elements() != src.Elements()) return false; - std::size_t widthBytes = dst.ElementBytes(); + std::size_t widthBytes = GetMemcpy2DWidthBytes(dst, src); + if (widthBytes == 0) { + return false; + } auto dstLayout = GetMemcpy2DLayout(dst, widthBytes); auto srcLayout = GetMemcpy2DLayout(src, widthBytes); if (!dstLayout || !srcLayout) { diff --git a/flang-rt/unittests/Runtime/CUDA/Memory.cpp b/flang-rt/unittests/Runtime/CUDA/Memory.cpp index 907df0ffb985a..a3aceb884cd83 100644 --- a/flang-rt/unittests/Runtime/CUDA/Memory.cpp +++ b/flang-rt/unittests/Runtime/CUDA/Memory.cpp @@ -160,3 +160,147 @@ TEST(MemoryCUFTest, CUFDataTransferDescDescStrided) { EXPECT_EQ(recvStorage[i * stride + 1], -2); } } + +TEST(MemoryCUFTest, CUFDataTransferDescDescLeadingSliceRank2) { + using Fortran::common::TypeCategory; + static constexpr int nx{8}; + static constexpr int ny{4}; + static constexpr int elements{nx * ny}; + SubscriptValue sliceExtent[]{nx - 2, ny}; + + std::int32_t hostStorage[elements]{}; + for (int j{0}; j < ny; ++j) { + for (int i{1}; i < nx - 1; ++i) { + hostStorage[i + nx * j] = i + 10 * j; + } + } + + std::int32_t *devStorage{static_cast(RTNAME(CUFMemAlloc)( + sizeof(hostStorage), kMemTypeDevice, __FILE__, __LINE__))}; + ASSERT_NE(devStorage, nullptr); + cudaMemset(devStorage, 0xff, sizeof(hostStorage)); + + StaticDescriptor<2> hostStaticDesc; + Descriptor &hostDesc{hostStaticDesc.descriptor()}; + hostDesc.Establish(TypeCode{TypeCategory::Integer, 4}, sizeof(std::int32_t), + hostStorage + 1, 2, sliceExtent); + hostDesc.GetDimension(0).SetByteStride(sizeof(std::int32_t)); + hostDesc.GetDimension(1).SetByteStride(nx * sizeof(std::int32_t)); + + StaticDescriptor<2> devStaticDesc; + Descriptor &devDesc{devStaticDesc.descriptor()}; + devDesc.Establish(TypeCode{TypeCategory::Integer, 4}, 
sizeof(std::int32_t), + devStorage + 1, 2, sliceExtent); + devDesc.GetDimension(0).SetByteStride(sizeof(std::int32_t)); + devDesc.GetDimension(1).SetByteStride(nx * sizeof(std::int32_t)); + + RTNAME(CUFDataTransferDescDesc) + (&devDesc, &hostDesc, kHostToDevice, __FILE__, __LINE__); + + std::int32_t result[elements]{}; + RTNAME(CUFDataTransferPtrPtr) + (result, devStorage, sizeof(result), kDeviceToHost, __FILE__, __LINE__); + + std::int32_t recvStorage[elements]{}; + for (int i{0}; i < elements; ++i) { + recvStorage[i] = -2; + } + StaticDescriptor<2> recvStaticDesc; + Descriptor &recvDesc{recvStaticDesc.descriptor()}; + recvDesc.Establish(TypeCode{TypeCategory::Integer, 4}, sizeof(std::int32_t), + recvStorage + 1, 2, sliceExtent); + recvDesc.GetDimension(0).SetByteStride(sizeof(std::int32_t)); + recvDesc.GetDimension(1).SetByteStride(nx * sizeof(std::int32_t)); + RTNAME(CUFDataTransferDescDesc) + (&recvDesc, &devDesc, kDeviceToHost, __FILE__, __LINE__); + + RTNAME(CUFMemFree)(devStorage, kMemTypeDevice, __FILE__, __LINE__); + + for (int j{0}; j < ny; ++j) { + EXPECT_EQ(result[nx * j], -1); + EXPECT_EQ(result[nx - 1 + nx * j], -1); + EXPECT_EQ(recvStorage[nx * j], -2); + EXPECT_EQ(recvStorage[nx - 1 + nx * j], -2); + for (int i{1}; i < nx - 1; ++i) { + const int index{i + nx * j}; + EXPECT_EQ(result[index], hostStorage[index]); + EXPECT_EQ(recvStorage[index], hostStorage[index]); + } + } +} + +TEST(MemoryCUFTest, CUFDataTransferDescDescLeadingSlice) { + using Fortran::common::TypeCategory; + static constexpr int nx{8}; + static constexpr int ny{4}; + static constexpr int nz{3}; + static constexpr int elements{nx * ny * nz}; + SubscriptValue sliceExtent[]{nx - 2, ny, nz}; + + std::int32_t hostStorage[elements]{}; + for (int k{0}; k < nz; ++k) { + for (int j{0}; j < ny; ++j) { + for (int i{1}; i < nx - 1; ++i) { + hostStorage[i + nx * (j + ny * k)] = i + 10 * j + 100 * k; + } + } + } + + std::int32_t *devStorage{static_cast(RTNAME(CUFMemAlloc)( + sizeof(hostStorage), 
kMemTypeDevice, __FILE__, __LINE__))}; + ASSERT_NE(devStorage, nullptr); + cudaMemset(devStorage, 0xff, sizeof(hostStorage)); + + StaticDescriptor<3> hostStaticDesc; + Descriptor &hostDesc{hostStaticDesc.descriptor()}; + hostDesc.Establish(TypeCode{TypeCategory::Integer, 4}, sizeof(std::int32_t), + hostStorage + 1, 3, sliceExtent); + hostDesc.GetDimension(0).SetByteStride(sizeof(std::int32_t)); + hostDesc.GetDimension(1).SetByteStride(nx * sizeof(std::int32_t)); + hostDesc.GetDimension(2).SetByteStride(nx * ny * sizeof(std::int32_t)); + + StaticDescriptor<3> devStaticDesc; + Descriptor &devDesc{devStaticDesc.descriptor()}; + devDesc.Establish(TypeCode{TypeCategory::Integer, 4}, sizeof(std::int32_t), + devStorage + 1, 3, sliceExtent); + devDesc.GetDimension(0).SetByteStride(sizeof(std::int32_t)); + devDesc.GetDimension(1).SetByteStride(nx * sizeof(std::int32_t)); + devDesc.GetDimension(2).SetByteStride(nx * ny * sizeof(std::int32_t)); + + RTNAME(CUFDataTransferDescDesc) + (&devDesc, &hostDesc, kHostToDevice, __FILE__, __LINE__); + + std::int32_t result[elements]{}; + RTNAME(CUFDataTransferPtrPtr) + (result, devStorage, sizeof(result), kDeviceToHost, __FILE__, __LINE__); + + std::int32_t recvStorage[elements]{}; + for (int i{0}; i < elements; ++i) { + recvStorage[i] = -2; + } + StaticDescriptor<3> recvStaticDesc; + Descriptor &recvDesc{recvStaticDesc.descriptor()}; + recvDesc.Establish(TypeCode{TypeCategory::Integer, 4}, sizeof(std::int32_t), + recvStorage + 1, 3, sliceExtent); + recvDesc.GetDimension(0).SetByteStride(sizeof(std::int32_t)); + recvDesc.GetDimension(1).SetByteStride(nx * sizeof(std::int32_t)); + recvDesc.GetDimension(2).SetByteStride(nx * ny * sizeof(std::int32_t)); + RTNAME(CUFDataTransferDescDesc) + (&recvDesc, &devDesc, kDeviceToHost, __FILE__, __LINE__); + + RTNAME(CUFMemFree)(devStorage, kMemTypeDevice, __FILE__, __LINE__); + + for (int k{0}; k < nz; ++k) { + for (int j{0}; j < ny; ++j) { + EXPECT_EQ(result[nx * (j + ny * k)], -1); + 
EXPECT_EQ(result[nx - 1 + nx * (j + ny * k)], -1); + EXPECT_EQ(recvStorage[nx * (j + ny * k)], -2); + EXPECT_EQ(recvStorage[nx - 1 + nx * (j + ny * k)], -2); + for (int i{1}; i < nx - 1; ++i) { + const int index{i + nx * (j + ny * k)}; + EXPECT_EQ(result[index], hostStorage[index]); + EXPECT_EQ(recvStorage[index], hostStorage[index]); + } + } + } +} diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md index 3ff56dbded1d7..385d44b7ced07 100644 --- a/flang/docs/Directives.md +++ b/flang/docs/Directives.md @@ -29,6 +29,42 @@ A list of non-standard directives supported by Flang argument's descriptor and passed as a raw pointer. The letter (P) ignores pointer and allocatable matching, so that one can pass an allocatable array to routine with pointer array argument and vice versa. + The letter (M) disables matching of the actual argument's CUDA storage + (managed/unified) against the dummy's. Its main use is in host modules that + overload the same routine with both a host-typed and a `device`-typed + specific: placing (M) on the device-typed dummy turns that specific into an + overload discriminator. Under `-gpu=mem:unified` or `-gpu=mem:managed`, an + unattributed host actual is normally allowed to bind to a `device` dummy + (the host-to-device attribute check is relaxed). (M) on that dummy opts it + out of the relaxation: an unattributed host actual then binds to the + host-typed specific in the same overload set, while actuals with an + explicit `device`, `managed`, or `unified` attribute continue to bind to + the device-typed specific. For example: +``` + interface compute + module procedure compute_host + module procedure compute_device + end interface +contains + subroutine compute_host(alpha) + real :: alpha + end + subroutine compute_device(alpha) + real, device :: alpha + !dir$ ignore_tkr(m) alpha + end + ! ... + real :: a ! plain host scalar + real, device :: d ! device scalar + call compute(a) ! always binds to compute_host + call compute(d) ! 
always binds to compute_device +``` + For contrast: without `ignore_tkr(m)` on `compute_device`, + `call compute(a)` compiled with `-gpu=mem:unified` would instead resolve + to `compute_device`, because the matching rules let `a` bind to the + device dummy and rank it as a closer match than the host one (see the + "Attributed Argument Matching Distance Values" table in section 3.2.3 + of the CUDA Fortran Programming Guide). For example, if one wanted to call a "set all bytes to zero" utility that could be applied to arrays of any type or rank: ``` diff --git a/flang/docs/OpenACC.md b/flang/docs/OpenACC.md index 9a166aa9bdde4..720afa5c830e4 100644 --- a/flang/docs/OpenACC.md +++ b/flang/docs/OpenACC.md @@ -33,7 +33,7 @@ local: or module, but it is allowed with a warning when same clause is used. * The OpenACC specification does not prohibit the same variable from appearing in multiple data clauses, but this is disallowed for variables appearing in - `private`, `firstprivate`, or `reduction` clauses. + `reduction` clauses. * The OpenACC specification does not prohibit the same variable from appearing multiple times in a `use_device` clause on a `host_data` construct, but this is disallowed. diff --git a/flang/include/flang/Semantics/semantics.h b/flang/include/flang/Semantics/semantics.h index 0abfd150cefe0..6893f51c97122 100644 --- a/flang/include/flang/Semantics/semantics.h +++ b/flang/include/flang/Semantics/semantics.h @@ -33,6 +33,7 @@ class IntrinsicTypeDefaultKinds; } namespace Fortran::parser { +struct AccObject; struct Name; struct Program; class AllCookedSources; @@ -336,6 +337,15 @@ class SemanticsContext { void NoteUsedSymbols(const UnorderedSymbolSet &); bool IsSymbolUsed(const Symbol &) const; + // Track same-kind duplicate AccObjects between resolve-directives and + // rewrite-parse-tree (e.g. the second `x` in `private(x, x)`). 
+ void MarkAccObjectDuplicate(const parser::AccObject *o) { + accObjectDuplicates_.insert(o); + } + bool IsAccObjectDuplicate(const parser::AccObject *o) const { + return accObjectDuplicates_.count(o) != 0; + } + void DumpSymbols(llvm::raw_ostream &); // Top-level ProgramTrees are owned by the SemanticsContext for persistence. @@ -395,6 +405,7 @@ class SemanticsContext { std::map moduleFileOutputRenamings_; UnorderedSymbolSet isDefined_; UnorderedSymbolSet isUsed_; + std::set accObjectDuplicates_; std::list programTrees_; }; diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index dad401f0baa74..8ee0613bdfc5f 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -2839,8 +2839,29 @@ static int CompareCudaMatchingDistance( return 0; } -// Compute the matching distance as described in section 3.2.3 of the CUDA -// Fortran references. +// Compute the matching distance for one (dummy, actual) pair as described +// in section 3.2.3 ("Table 2: Attributed Argument Matching Distance Values") +// of the CUDA Fortran Programming Guide. The column applied for the actual +// depends on its CUDA data attribute and (for unattributed actuals) on the +// active -gpu=mem:{unified,managed} mode. 
+// +// Distance values returned (smaller is a better match; INF means +// incompatible and disqualifies the candidate): +// +// Actual argument attribute +// None ACC gpu= gpu= +// Dummy attr (Host) Device Managed Unified use_dev unified managed +// ----------+--------+-------+--------+-------+--------+--------+--------+ +// None(host)| 0 | INF | 3 | 3 | 3 | 3 | 3 | +// Device | INF | 0 | 2 | 2 | 0 | 2 | 2 | +// Managed | INF | INF | 0 | 1 | INF | 1 | 0 | +// Unified | INF | INF | 1 | 0 | INF | 0 | 1 | +// +// In addition: a dummy declared TYPE(*) (assumed-size/rank opaque buffer) +// is "CUDA address space agnostic" and accepts any attributed actual at a +// non-zero distance (3) so an explicit Device overload still wins. The +// "ACC use_dev" column applies to actuals appearing in a surrounding +// ACC HOST_DATA USE_DEVICE clause. static int GetMatchingDistance(const common::LanguageFeatureControl &features, const characteristics::DummyArgument &dummy, const std::optional &actual) { diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 7c2f26b41a944..02c71cc4babf0 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -384,8 +384,8 @@ class AccAttributeVisitor : DirectiveAttributeVisitor { Symbol *ResolveAccCommonBlockName(const parser::Name *); Symbol *DeclareOrMarkOtherAccessEntity(const parser::Name &, Symbol::Flag); Symbol *DeclareOrMarkOtherAccessEntity(Symbol &, Symbol::Flag); - void CheckMultipleAppearances( - const parser::Name &, const Symbol &, Symbol::Flag); + void CheckMultipleAppearances(const parser::Name &, const Symbol &, + Symbol::Flag, const parser::AccObject *occurrence = nullptr); void AllowOnlyArrayAndSubArray(const parser::AccObjectList &objectList); void DoNotAllowAssumedSizedArray(const parser::AccObjectList &objectList); void AllowOnlyVariable(const parser::AccObject &object); @@ -1875,7 +1875,7 @@ void 
AccAttributeVisitor::ResolveAccObject( if (auto *symbol{ResolveAcc(*name, accFlag, currScope())}) { AddToContextObjectWithDSA(*symbol, accFlag); if (dataSharingAttributeFlags.test(accFlag)) { - CheckMultipleAppearances(*name, *symbol, accFlag); + CheckMultipleAppearances(*name, *symbol, accFlag, &accObject); } } } else { @@ -1940,20 +1940,31 @@ Symbol *AccAttributeVisitor::DeclareOrMarkOtherAccessEntity( return &object; } -static bool WithMultipleAppearancesAccException( - const Symbol &symbol, Symbol::Flag flag) { - return false; // Place holder -} - -void AccAttributeVisitor::CheckMultipleAppearances( - const parser::Name &name, const Symbol &symbol, Symbol::Flag accFlag) { +void AccAttributeVisitor::CheckMultipleAppearances(const parser::Name &name, + const Symbol &symbol, Symbol::Flag accFlag, + const parser::AccObject *occurrence) { const auto *target{&symbol}; - if (HasDataSharingAttributeObject(*target) && - !WithMultipleAppearancesAccException(symbol, accFlag)) { - context_.Say(name.source, - "'%s' appears in more than one data-sharing clause " - "on the same OpenACC directive"_err_en_US, - name.ToString()); + if (HasDataSharingAttributeObject(*target)) { + // A same-kind duplicate (e.g. private(x, x) or private(x) private(x)) + // is benign: warn and tag this AccObject occurrence so rewrite-parse-tree + // can drop it from the clause list. Cross-kind duplicates (e.g. + // private(x) firstprivate(x)) remain hard errors. + // + // Reduction is excluded from the benign case: two reduction clauses + // with the same Symbol::Flag may still differ in operator, which is a + // real conflict that dedup would silently hide. 
+ auto firstFlag{GetContext().FindSymbolWithDSA(*target)}; + if (occurrence && firstFlag && *firstFlag == accFlag && + accFlag != Symbol::Flag::AccReduction) { + context_.Warn(common::UsageWarning::OpenAccUsage, name.source, + "'%s' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored"_warn_en_US, + name.ToString()); + context_.MarkAccObjectDuplicate(occurrence); + } else { + context_.Say(name.source, + "'%s' appears in more than one data-sharing clause on the same OpenACC directive"_err_en_US, + name.ToString()); + } } else { AddDataSharingAttributeObject(*target); } diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp index 7352c2a324616..500dd3f225889 100644 --- a/flang/lib/Semantics/rewrite-parse-tree.cpp +++ b/flang/lib/Semantics/rewrite-parse-tree.cpp @@ -62,6 +62,7 @@ class RewriteMutator { void Post(parser::IfConstruct &); void Post(parser::ReadStmt &); void Post(parser::WriteStmt &); + void Post(parser::AccObjectList &); // Name resolution yet implemented: // TODO: Can some/all of these now be enabled? @@ -496,6 +497,15 @@ void RewriteMutator::Post(parser::WriteStmt &x) { FixMisparsedUntaggedNamelistName(x); } +// Erase AccObjects recorded in the context by resolve-directives as same-kind +// data-sharing duplicates. Cross-kind duplicates remain hard errors and never +// reach this pass. 
+void RewriteMutator::Post(parser::AccObjectList &x) { + x.v.remove_if([this](const parser::AccObject &o) { + return context_.IsAccObjectDuplicate(&o); + }); +} + bool RewriteParseTree(SemanticsContext &context, parser::Program &program) { RewriteMutator mutator{context}; parser::Walk(program, mutator); diff --git a/flang/lib/Support/Fortran.cpp b/flang/lib/Support/Fortran.cpp index d38e7dc051562..39a3d64a464fc 100644 --- a/flang/lib/Support/Fortran.cpp +++ b/flang/lib/Support/Fortran.cpp @@ -148,11 +148,19 @@ bool AreCompatibleCUDADataAttrs(std::optional x, } } else { if (*x == CUDADataAttr::Device) { - if ((y && - (*y == CUDADataAttr::Managed || *y == CUDADataAttr::Unified || - *y == CUDADataAttr::Shared || - *y == CUDADataAttr::Constant)) || - (!y && (isCudaUnified || isCudaManaged))) { + if (y && + (*y == CUDADataAttr::Managed || *y == CUDADataAttr::Unified || + *y == CUDADataAttr::Shared || *y == CUDADataAttr::Constant)) { + return true; + } + // A device dummy carrying !dir$ ignore_tkr(m) opts out of the + // -gpu=mem:{unified,managed} relaxation that would otherwise let + // an unattributed host actual bind to it. The (m) letter is used + // by host modules to mark device-typed dummies as overload + // discriminators that should only accept actuals with an explicit + // device/managed/unified attribute. + if (!y && (isCudaUnified || isCudaManaged) && + !ignoreTKR.test(IgnoreTKR::Managed)) { return true; } } else if (*x == CUDADataAttr::Managed) { diff --git a/flang/test/Lower/OpenACC/acc-dedup-private.f90 b/flang/test/Lower/OpenACC/acc-dedup-private.f90 new file mode 100644 index 0000000000000..6399324070831 --- /dev/null +++ b/flang/test/Lower/OpenACC/acc-dedup-private.f90 @@ -0,0 +1,63 @@ +! RUN: bbc -fopenacc -emit-hlfir %s -o - 2>/dev/null | FileCheck %s + +! Check that same-kind duplicate variables in OpenACC private/firstprivate +! clauses lower without failure, and that each variable produces exactly one +! 
acc.private / acc.firstprivate op (deduplication by rewrite-parse-tree). + +! ----------------------------------------------------------------------- +! private(x, x) -- duplicate within one clause + +subroutine test_private_pair(i) + integer :: x, i + !$acc parallel loop private(x, x) + do i = 1, 10 + end do +end subroutine + +! CHECK-LABEL: func.func @_QPtest_private_pair +! x is privatized exactly once. +! CHECK: acc.private varPtr({{.*}}) recipe(@privatization_ref_i32) -> !fir.ref {name = "x"} +! CHECK-NOT: acc.private varPtr({{.*}}) recipe(@privatization_ref_i32) -> !fir.ref {name = "x"} + +! ----------------------------------------------------------------------- +! private(x, x, x) -- two duplicates (from the triple-occurrence review note) + +subroutine test_private_triple(i) + integer :: x, i + !$acc parallel loop private(x, x, x) + do i = 1, 10 + end do +end subroutine + +! CHECK-LABEL: func.func @_QPtest_private_triple +! x is privatized exactly once even with three source occurrences. +! CHECK: acc.private varPtr({{.*}}) recipe(@privatization_ref_i32) -> !fir.ref {name = "x"} +! CHECK-NOT: acc.private varPtr({{.*}}) recipe(@privatization_ref_i32) -> !fir.ref {name = "x"} + +! ----------------------------------------------------------------------- +! private(x) private(x) -- duplicate across two separate clauses + +subroutine test_private_two_clauses(i) + integer :: x, i + !$acc parallel loop private(x) private(x) + do i = 1, 10 + end do +end subroutine + +! CHECK-LABEL: func.func @_QPtest_private_two_clauses +! CHECK: acc.private varPtr({{.*}}) recipe(@privatization_ref_i32) -> !fir.ref {name = "x"} +! CHECK-NOT: acc.private varPtr({{.*}}) recipe(@privatization_ref_i32) -> !fir.ref {name = "x"} + +! ----------------------------------------------------------------------- +! firstprivate(x, x) + +subroutine test_firstprivate_pair(i) + integer :: x, i + !$acc parallel loop firstprivate(x, x) + do i = 1, 10 + end do +end subroutine + +! 
CHECK-LABEL: func.func @_QPtest_firstprivate_pair +! CHECK: acc.firstprivate varPtr({{.*}}) recipe(@firstprivatization_ref_i32) -> !fir.ref {name = "x"} +! CHECK-NOT: acc.firstprivate varPtr({{.*}}) recipe(@firstprivatization_ref_i32) -> !fir.ref {name = "x"} diff --git a/flang/test/Parser/acc-dedup-unparse.f90 b/flang/test/Parser/acc-dedup-unparse.f90 new file mode 100644 index 0000000000000..26fa422ff6aad --- /dev/null +++ b/flang/test/Parser/acc-dedup-unparse.f90 @@ -0,0 +1,28 @@ +! RUN: %flang_fc1 -fopenacc -fdebug-unparse -w %s | FileCheck %s + +! Verify that same-kind duplicate variables in OpenACC data-sharing clauses are +! removed by rewrite-parse-tree, so each variable appears at most once when +! unparsed. + +subroutine dedup_pair(x, i) + integer, intent(inout) :: x, i + !$acc parallel loop private(x, x) + do i = 1, 10 + end do +end subroutine +! CHECK-LABEL: SUBROUTINE dedup_pair +! CHECK: PRIVATE(x) +! CHECK-NOT: PRIVATE(x,x) +! CHECK-NOT: PRIVATE(x, x) + +subroutine dedup_triple(x, i) + integer, intent(inout) :: x, i + !$acc parallel loop private(x, x, x) + do i = 1, 10 + end do +end subroutine +! CHECK-LABEL: SUBROUTINE dedup_triple +! Three occurrences reduce to one. +! CHECK: PRIVATE(x) +! CHECK-NOT: PRIVATE(x,x) +! CHECK-NOT: PRIVATE(x, x) diff --git a/flang/test/Semantics/OpenACC/acc-dataclause-dedup.f90 b/flang/test/Semantics/OpenACC/acc-dataclause-dedup.f90 new file mode 100644 index 0000000000000..e29332578a0bf --- /dev/null +++ b/flang/test/Semantics/OpenACC/acc-dataclause-dedup.f90 @@ -0,0 +1,122 @@ +! RUN: %python %S/../test_errors.py %s %flang -fopenacc + +! Same-kind data-sharing duplicates on an OpenACC directive (e.g. +! private(x, x), private(x) private(x), copyin(x, x) ...) are not errors: +! resolve-directives warns and rewrite-parse-tree drops the duplicate +! occurrences from the clause object lists. Cross-kind duplicates +! (e.g. private(x) firstprivate(x)) and reduction duplicates remain +! hard errors. 
+ +program test_dataclause_dedup + implicit none + integer :: x, y, z, i + + ! passThis1.f90 pattern: duplicate within a single PRIVATE clause. + !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !$acc parallel loop private(x, x) + do i = 1, 10 + end do + + ! passThis2.f90 pattern: duplicate within a single PRIVATE clause across + ! a continuation, with another variable in between. + !$acc parallel loop private(x, & + !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !$acc& y, x) + do i = 1, 10 + end do + + ! passThis3.f90 pattern: duplicate across two separate PRIVATE clauses + ! on the same directive. + !$acc parallel loop private(x) & + !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !$acc& private(y, x) + do i = 1, 10 + end do + + ! Same patterns generalize to FIRSTPRIVATE. + !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !$acc parallel loop firstprivate(x, x) + do i = 1, 10 + end do + + !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !$acc parallel loop firstprivate(x) firstprivate(y, x) + do i = 1, 10 + end do + + ! Multiple distinct duplicates on a single directive. + !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !WARNING: 'y' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !$acc parallel loop private(x, y, x, y) + do i = 1, 10 + end do + + ! Triple occurrence: two duplicates, both warned, only one survives dedup. 
+ !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !$acc parallel loop private(x, x, x) + do i = 1, 10 + end do + + ! Cross-kind duplicates on the same directive remain hard errors. + !ERROR: 'x' appears in more than one data-sharing clause on the same OpenACC directive + !$acc parallel loop private(x) firstprivate(x) + do i = 1, 10 + end do + + ! Reduction is excluded from the benign case: same-flag duplicates may + ! differ in operator, which is a real conflict. + !ERROR: 'x' appears in more than one data-sharing clause on the same OpenACC directive + !$acc parallel loop reduction(+:x) reduction(*:x) + do i = 1, 10 + end do + + ! Regression coverage for non-bare designators: the dedup machinery only + ! examines simple-Name DataRefs, so distinct array elements and array + ! sections must pass through untouched, with no warning and no erasure. + block + integer :: arr(10) + integer, target :: t1, t2 + integer, pointer :: p + type :: pt + integer :: a + integer :: b + end type + type(pt) :: s + + ! Different array elements -- not duplicates. + !$acc parallel loop private(arr(1), arr(2)) + do i = 1, 10 + end do + + ! Different array sections -- not duplicates. + !$acc parallel loop private(arr(1:5), arr(6:10)) + do i = 1, 10 + end do + + ! Same array element listed twice -- not deduped, since GetDesignatorName- + ! IfDataRef returns null for ArrayElement and CheckMultipleAppearances + ! is never invoked. Compiles without diagnostics. + !$acc parallel loop private(arr(1), arr(1)) + do i = 1, 10 + end do + + ! Same array section listed twice -- same reasoning, no diagnostic. + !$acc parallel loop private(arr(1:5), arr(1:5)) + do i = 1, 10 + end do + + ! Distinct structure components -- not duplicates. 
+ !$acc parallel loop private(s%a, s%b) + do i = 1, 10 + end do + + ! Mixing a bare-name designator and an array-element designator on the + ! same symbol must not trigger dedup -- the array element doesn't go + ! through the duplicate check at all. + !$acc parallel loop private(arr, arr(1)) + do i = 1, 10 + end do + end block + +end program diff --git a/flang/test/Semantics/cuf-ignore-tkr-m-error.cuf b/flang/test/Semantics/cuf-ignore-tkr-m-error.cuf new file mode 100644 index 0000000000000..05863c3e1ca50 --- /dev/null +++ b/flang/test/Semantics/cuf-ignore-tkr-m-error.cuf @@ -0,0 +1,32 @@ +! RUN: not bbc -emit-hlfir -fcuda -gpu=unified %s -o /dev/null 2>&1 | FileCheck %s + +! A device-attributed dummy carrying !dir$ ignore_tkr(m) opts out of +! the -gpu=mem:unified relaxation that would otherwise let an +! unattributed host actual bind to it. If a generic exposes ONLY such +! a specific, no viable candidate remains for a plain host actual and +! the call must be diagnosed. +! +! (cuf14.cuf and cuf-ignore-tkr-m-generic.cuf cover the contrasting +! cases where the device dummy has no ignore_tkr(m) and either a host +! specific is present alongside it, or it is the only specific. Those +! flows are unaffected by this carve-out.) + +module m + interface gen_only_device + module procedure sub_device_ignore_m + end interface + +contains + subroutine sub_device_ignore_m(x) + real, device :: x + !dir$ ignore_tkr(m) x + end subroutine +end module + +subroutine caller + use m + real :: a + call gen_only_device(a) +end subroutine + +! CHECK: No specific subroutine of generic 'gen_only_device' matches the actual arguments diff --git a/flang/test/Semantics/cuf-ignore-tkr-m-generic.cuf b/flang/test/Semantics/cuf-ignore-tkr-m-generic.cuf new file mode 100644 index 0000000000000..62e54b35a38a2 --- /dev/null +++ b/flang/test/Semantics/cuf-ignore-tkr-m-generic.cuf @@ -0,0 +1,56 @@ +! RUN: bbc -emit-hlfir -fcuda -gpu=unified %s -o - | FileCheck %s + +! 
Under -gpu=mem:unified, a device-attributed dummy that carries +! !dir$ ignore_tkr(m) opts out of the relaxation that lets an +! unattributed host actual bind to a device dummy. Such dummies act +! purely as overload discriminators -- the (m) indicates they should +! only be selected when the actual has an explicit +! device/managed/unified attribute. When the same generic also has a +! plain host-typed specific, that host specific must therefore be +! selected for an unattributed host actual. +! +! For comparison, a device specific without ignore_tkr(m) still wins +! over a host specific for an unattributed host actual under +! -gpu=mem:unified (this is what cuf14.cuf already covers, and is +! reproduced here for contrast). + +module m + interface gen_pair + module procedure sub_host + module procedure sub_device_ignore_m + end interface + + interface gen_pair_no_ignore + module procedure sub_host + module procedure sub_device_plain + end interface + +contains + subroutine sub_host(x) + real :: x + end subroutine + + subroutine sub_device_ignore_m(x) + real, device :: x + !dir$ ignore_tkr(m) x + end subroutine + + subroutine sub_device_plain(x) + real, device :: x + end subroutine +end module + +subroutine caller + use m + real :: a, b + ! ignore_tkr(m) on sub_device_ignore_m's dummy opts that specific + ! out of accepting an unattributed host actual -> sub_host wins. + call gen_pair(a) + ! No ignore_tkr(m); the device specific accepts the host actual and + ! is preferred over the host specific (this case is unchanged). + call gen_pair_no_ignore(b) +end subroutine + +! CHECK-LABEL: func.func @_QPcaller +! CHECK: fir.call @_QMmPsub_host +! CHECK: fir.call @_QMmPsub_device_plain diff --git a/flang/test/Semantics/cuf-matching-distance.cuf b/flang/test/Semantics/cuf-matching-distance.cuf new file mode 100644 index 0000000000000..a852aff259ee8 --- /dev/null +++ b/flang/test/Semantics/cuf-matching-distance.cuf @@ -0,0 +1,90 @@ +! 
RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s --check-prefix=NORM +! RUN: bbc -emit-hlfir -fcuda -gpu=unified %s -o - | FileCheck %s --check-prefix=UNI +! RUN: bbc -emit-hlfir -fcuda -gpu=managed %s -o - | FileCheck %s --check-prefix=MAN + +! Comprehensive coverage of Table 2 ("Attributed Argument Matching +! Distance Values") from CUDA Fortran Programming Guide §3.2.3. +! +! One generic exposes a host, device, managed, and unified specific. +! Each call site picks the winning specific based on the actual's CUDA +! attribute (or, for unattributed actuals, the active -gpu=mem mode). +! +! Actual argument attribute +! None gpu= gpu= +! Dummy attr (Host) Device Managed Unified unified managed +! ----------+--------+-------+--------+--------+--------+--------+ +! None(host)| 0 | INF | 3 | 3 | 3 | 3 | +! Device | INF | 0 | 2 | 2 | 2 | 2 | +! Managed | INF | INF | 0 | 1 | 1 | 0 | +! Unified | INF | INF | 1 | 0 | 0 | 1 | + +module m + interface gen + module procedure sub_host + module procedure sub_device + module procedure sub_managed + module procedure sub_unified + end interface +contains + subroutine sub_host(x) + integer :: x(:) + end subroutine + subroutine sub_device(x) + integer, device :: x(:) + end subroutine + subroutine sub_managed(x) + integer, managed :: x(:) + end subroutine + subroutine sub_unified(x) + integer, unified :: x(:) + end subroutine +end module + +! Test driver: one call per actual attribute. Each compilation mode +! (no flag, -gpu=unified, -gpu=managed) yields a different winner for +! the unattributed allocatable (the "None" actual). +subroutine driver + use m + integer, allocatable :: act_none(:) + integer, device, allocatable :: act_dev(:) + integer, managed, allocatable :: act_man(:) + integer, unified, allocatable :: act_uni(:) + allocate(act_none(4), act_dev(4), act_man(4), act_uni(4)) + + call gen(act_none) + call gen(act_dev) + call gen(act_man) + call gen(act_uni) +end subroutine + +! 
Without any -gpu=mem mode, an unattributed actual matches the host +! specific (Table 2 column "Actual None (host)"). Explicit Device, +! Managed and Unified actuals each select their corresponding +! specific (distance 0 down the diagonal). +! NORM-LABEL: func.func @_QPdriver +! NORM: fir.call @_QMmPsub_host +! NORM: fir.call @_QMmPsub_device +! NORM: fir.call @_QMmPsub_managed +! NORM: fir.call @_QMmPsub_unified + +! Under -gpu=mem:unified, an unattributed actual matches the unified +! specific (Table 2 column "Actual None (gpu=mem:unified)": Unified=0 +! beats Managed=1, Device=2, Host=3). Explicitly attributed actuals +! still pick their exact-match specific. +! UNI-LABEL: func.func @_QPdriver +! UNI: fir.call @_QMmPsub_unified +! UNI: fir.call @_QMmPsub_device +! UNI: fir.call @_QMmPsub_managed +! UNI: fir.call @_QMmPsub_unified + +! Under -gpu=mem:managed, an unattributed actual matches the managed +! specific (Table 2 column "Actual None (gpu=mem:managed)": Managed=0 +! beats Unified=1, Device=2, Host=3). Explicit Device/Managed/Unified +! actuals are unaffected by the -gpu mode and pick their exact-match +! specific -- in particular, an explicit Unified actual still binds +! to the Unified specific (Unified=0 < Managed=1), matching Table 2. +! MAN-LABEL: func.func @_QPdriver +! MAN: fir.call @_QMmPsub_managed +! MAN: fir.call @_QMmPsub_device +! MAN: fir.call @_QMmPsub_managed +! 
MAN: fir.call @_QMmPsub_unified diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index d730f2236a3a4..2f095dc67cbcf 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -141,6 +141,8 @@ option(LLVM_LIBC_FULL_BUILD "Build and test LLVM libc as if it is the full libc" option(LLVM_LIBC_IMPLEMENTATION_DEFINED_TEST_BEHAVIOR "Build LLVM libc tests assuming our implementation-defined behavior" ON) option(LLVM_LIBC_ENABLE_LINTING "Enables linting of libc source files" OFF) option(LLVM_LIBC_ALL_HEADERS "Outputs all functions in header files, regardless of whether they are enabled on this target" OFF) +option(LLVM_LIBC_ENABLE_EXPERIMENTAL_ENTRYPOINTS + "Enable entrypoints with known-incomplete implementations (off by default)" OFF) option(LIBC_CONFIG_PATH "The path to user provided folder that configures the build for the target system." OFF) diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index e62bc67e2d5ca..e61b127e42102 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -387,7 +387,6 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.setsid libc.src.unistd.symlink libc.src.unistd.symlinkat - libc.src.unistd.sysconf libc.src.unistd.truncate libc.src.unistd.unlink libc.src.unistd.unlinkat @@ -1265,6 +1264,12 @@ if(LLVM_LIBC_FULL_BUILD) ) endif() +if(LLVM_LIBC_ENABLE_EXPERIMENTAL_ENTRYPOINTS) + list(APPEND TARGET_LIBC_ENTRYPOINTS + libc.src.unistd.sysconf + ) +endif() + set(TARGET_LIBMVEC_ENTRYPOINTS) if(LIBC_COMPILER_HAS_EXT_VECTOR_TYPE) diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index d1c52dffdb6e7..7a34cc5fba201 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -390,7 +390,6 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.setsid libc.src.unistd.symlink libc.src.unistd.symlinkat - libc.src.unistd.sysconf libc.src.unistd.truncate libc.src.unistd.unlink 
libc.src.unistd.unlinkat @@ -1399,6 +1398,12 @@ if(LLVM_LIBC_FULL_BUILD) ) endif() +if(LLVM_LIBC_ENABLE_EXPERIMENTAL_ENTRYPOINTS) + list(APPEND TARGET_LIBC_ENTRYPOINTS + libc.src.unistd.sysconf + ) +endif() + set(TARGET_LLVMLIBC_ENTRYPOINTS ${TARGET_LIBC_ENTRYPOINTS} ${TARGET_LIBM_ENTRYPOINTS} diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 73b4b3fcd191f..00c94e1e9b5a0 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -408,7 +408,6 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.setsid libc.src.unistd.symlink libc.src.unistd.symlinkat - libc.src.unistd.sysconf libc.src.unistd.truncate libc.src.unistd.unlink libc.src.unistd.unlinkat @@ -1489,6 +1488,12 @@ if(LLVM_LIBC_FULL_BUILD) ) endif() +if(LLVM_LIBC_ENABLE_EXPERIMENTAL_ENTRYPOINTS) + list(APPEND TARGET_LIBC_ENTRYPOINTS + libc.src.unistd.sysconf + ) +endif() + set(TARGET_LIBMVEC_ENTRYPOINTS) if(LIBC_COMPILER_HAS_EXT_VECTOR_TYPE) diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py index 03c1af579b018..40db227607ee5 100644 --- a/lldb/packages/Python/lldbsuite/test/builders/builder.py +++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py @@ -259,6 +259,7 @@ def _getDebugInfoArgs(self, debug_info): "debug_names": {"MAKE_DEBUG_NAMES": "YES"}, "dwp": {"MAKE_DSYM": "NO", "MAKE_DWP": "YES"}, "pdb": {"MAKE_PDB": "YES"}, + "none": {"MAKE_DSYM": "NO", "MAKE_NO_DEBUG_INFO": "YES"}, } # Collect all flags, with later options overriding earlier ones diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 677124b8738f7..a3c5d94a570d1 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -240,6 +240,10 @@ ifeq "$(OS)" "Windows_NT" DEBUG_INFO_FLAG ?= -gdwarf endif +ifeq 
"$(MAKE_NO_DEBUG_INFO)" "YES" + DEBUG_INFO_FLAG := -g0 +endif + DEBUG_INFO_FLAG ?= -g CFLAGS ?= $(DEBUG_INFO_FLAG) -O0 diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp index dffbdceffc9cb..d79cd7e51f1b8 100644 --- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp +++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp @@ -28,6 +28,7 @@ #include "Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h" #include "Plugins/ObjectFile/ELF/ObjectFileELF.h" +#include "Plugins/ObjectFile/Placeholder/ObjectFilePlaceholder.h" #include "Plugins/Process/elf-core/RegisterUtilities.h" #include "ProcessElfCore.h" #include "ThreadElfCore.h" @@ -199,10 +200,10 @@ Status ProcessElfCore::DoLoadCore() { /// PT_AARCH64_MEMTAG_MTE - Contains AArch64 MTE memory tags for a range of /// Process Address Space. for (const elf::ELFProgramHeader &H : segments) { - DataExtractor data = core->GetSegmentData(H); // Parse thread contexts and auxv structure if (H.p_type == llvm::ELF::PT_NOTE) { + DataExtractor data = core->GetSegmentData(H); if (llvm::Error error = ParseThreadContextsFromNoteSegment(H, data)) return Status::FromError(std::move(error)); } @@ -256,42 +257,43 @@ Status ProcessElfCore::DoLoadCore() { // the main executable using data we found in the core file notes. 
lldb::ModuleSP exe_module_sp = GetTarget().GetExecutableModule(); if (!exe_module_sp) { - if (!m_nt_file_entries.empty()) { - std::string executable_path = GetMainExecutablePath(); - ModuleSpec exe_module_spec; - exe_module_spec.GetArchitecture() = arch; - exe_module_spec.GetUUID() = FindModuleUUID(executable_path); - exe_module_spec.GetFileSpec().SetFile(executable_path, - FileSpec::Style::native); - if (exe_module_spec.GetFileSpec()) { - exe_module_sp = - GetTarget().GetOrCreateModule(exe_module_spec, true /* notify */); + ModuleSpec exe_module_spec; + if (GetMainExecutableModuleSpec(exe_module_spec)) { + exe_module_sp = + GetTarget().GetOrCreateModule(exe_module_spec, true /* notify */); + if (!exe_module_sp) { + // Create an ELF file from memory for the main executable. The dynamic + // loader requires the main executable so that it can extract the + // DT_DEBUG key/value pair from the dynamic section and get the list + // of shared libraries. + std::optional exe_header = + GetNTFileEntryForExecutableELFHeader(); + if (exe_header) { + if (llvm::Expected module_sp_or_err = + ReadModuleFromMemory(exe_module_spec.GetFileSpec(), + exe_header->start, + exe_header->end - exe_header->start)) + exe_module_sp = *module_sp_or_err; + else + llvm::consumeError(module_sp_or_err.takeError()); + } + // Create a placeholder module for the main executable if we failed to + // create an ELF module from memory. if (!exe_module_sp) { - // Create an ELF file from memory for the main executable. The dynamic - // loader requires the main executable so that it can extract the - // DT_DEBUG key/value pair from the dynamic section and get the list - // of shared libraries. 
- std::optional exe_header_addr; - - // We need to find its load address - for (const NT_FILE_Entry &file_entry : m_nt_file_entries) { - if (file_entry.path == executable_path) { - exe_header_addr = file_entry.start; - break; - } - } - if (exe_header_addr) { - if (llvm::Expected module_sp_or_err = - ReadModuleFromMemory(exe_module_spec.GetFileSpec(), - *exe_header_addr)) - exe_module_sp = *module_sp_or_err; - else - llvm::consumeError(module_sp_or_err.takeError()); - } + lldb::addr_t load_addr = + exe_header ? exe_header->start : LLDB_INVALID_ADDRESS; + lldb::addr_t size = + exe_header ? (exe_header->end - exe_header->start) : 0; + exe_module_sp = + Module::CreateModuleFromObjectFile( + exe_module_spec, load_addr, size); + if (exe_module_spec.GetPlatformFileSpec()) + exe_module_sp->SetPlatformFileSpec( + exe_module_spec.GetPlatformFileSpec()); } - if (exe_module_sp) - GetTarget().SetExecutableModule(exe_module_sp, eLoadDependentsNo); } + if (exe_module_sp) + GetTarget().SetExecutableModule(exe_module_sp, eLoadDependentsNo); } } return error; @@ -313,30 +315,69 @@ void ProcessElfCore::UpdateBuildIdForNTFileEntries() { } } -std::string ProcessElfCore::GetMainExecutablePath() { - // Always try to read the program name from core file memory first via the - // AUXV_AT_EXECFN entry. This value is the address of a null terminated C - // string that contains the program path. +/// Correctly create a FileSpec from a path found in a core file. +/// +/// This method will guess the path style more intelligently than specifying +/// a native path style since core files can contain paths from a different +/// system than the host system. 
+static FileSpec CreateFileSpecFromPath(llvm::StringRef path) { + FileSpec::Style path_style = FileSpec::Style::native; + if (auto guessed_style = FileSpec::GuessPathStyle(path)) + path_style = *guessed_style; + return FileSpec(path, path_style); +} + +bool ProcessElfCore::GetMainExecutableModuleSpec(ModuleSpec &exe_spec) { AuxVector aux_vector(m_auxv); - std::string execfn_str; + exe_spec.GetArchitecture() = GetTarget().GetArchitecture(); + + // Find the NT_FILE_Entry for the main executable's ELF header. + std::optional exe_header = + GetNTFileEntryForExecutableELFHeader(); + if (exe_header) { + exe_spec.GetFileSpec() = CreateFileSpecFromPath(exe_header->path); + exe_spec.GetUUID() = FindModuleUUID(exe_header->path); + } + + // If we failed to find the executable program in the NT_FILE list with the + // program header address, then we can read the executable name from the value + // of the AUXV_AT_EXECFN in the AUX vector. The reason we don't use this file + // all of the time is if the program is launched using a symlink, the value of + // the AUXV_AT_EXECFN string will be the symlink itself. The same goes for the + // m_executable_name found in the NT_PRPSINFO section, it will be the name of + // the symlink. Even if we did find a path above, we want to fill in this path + // if it is different from main executable's path in the platform file name + // in case someone needs to know how the executable was launched. if (auto execfn = aux_vector.GetAuxValue(AuxVector::AUXV_AT_EXECFN)) { Status error; - if (ReadCStringFromMemory(*execfn, execfn_str, error)) - return execfn_str; + std::string execfn_str; + if (ReadCStringFromMemory(*execfn, execfn_str, error)) { + // This path can be a symlink path. Set it as the main file spec if one + // hasn't been set, else set the platform file spec. 
+ FileSpec execfn_spec = CreateFileSpecFromPath(execfn_str); + if (exe_spec.GetFileSpec()) { + // Fill in the platform file spec if it differs from the main path from + // the resolved file info in the NT_FILE note. + if (exe_spec.GetFileSpec() != execfn_spec) + exe_spec.GetPlatformFileSpec() = execfn_spec; + } else { + // We don't have an executable file spec yet, let's set it. + exe_spec.GetFileSpec() = execfn_spec; + exe_spec.GetUUID() = FindModuleUUID(execfn_str); + } + } } - if (m_nt_file_entries.empty()) - return {}; - - // The first entry in the NT_FILE might be our executable - std::string executable_path = m_nt_file_entries[0].path; - // Prefer the NT_FILE entry matching m_executable_name as main executable. - for (const NT_FILE_Entry &file_entry : m_nt_file_entries) - if (llvm::StringRef(file_entry.path).ends_with("/" + m_executable_name)) { - executable_path = file_entry.path; - break; - } - return executable_path; + // If we didn't set the executable file spec yet, let's set it from the info + // from the NT_PRPSINFO. This usually is just a basename of the actual path + // used to launch the binary, so this can be a symlink basename. But it will + // be better than nothing since we will create a placeholder module for any + // files that don't exist. + if (!exe_spec.GetFileSpec() && !m_executable_name.empty()) + exe_spec.GetFileSpec() = CreateFileSpecFromPath(m_executable_name); + + // We succeeded if we got a path. + return (bool)exe_spec.GetFileSpec(); } UUID ProcessElfCore::FindModuleUUID(const llvm::StringRef path) { @@ -1167,3 +1208,51 @@ bool ProcessElfCore::GetProcessInfo(ProcessInstanceInfo &info) { info.SetArguments(m_process_args.as_args(), /*first_arg_is_executable=*/true); return true; } + +/// Find the NT_FILE entry that contains an address. 
+std::optional +ProcessElfCore::GetNTFileEntryContainingAddress(lldb::addr_t addr) { + for (const NT_FILE_Entry &file_entry : m_nt_file_entries) { + if (file_entry.start <= addr && addr < file_entry.end) + return file_entry; + } + return std::nullopt; +} + +std::optional +ProcessElfCore::GetNTFileEntryForExecutableELFHeader() { + /// This method will search for the first NT_FILE entry that contains the + /// executable's ELF header. We use the AUXV_AT_PHDR from the aux vector to + /// find the address of the main executable's program headers and then find + /// the NT_FILE entry that contains this address. + /// + /// Previously we would try to find the first NT_FILE entry that had a path + /// that ended with the executable name found in the NT_PRPSINFO note, but + /// this basename can be the name of a symlink and not the actual resolved + /// executable file found in the NT_FILE entry so this could fail for cases + /// where a symlink was used to launch the program, and that symlink's + /// base name was different from the resolved executable file's name in + /// the NT_FILE entry. + if (m_nt_file_entries.empty()) + return std::nullopt; + // The AUX vector has the load address of the program headers from the main + // executable as the value for AUXV_AT_PHDR. We can use this value to find + // the NT_FILE entry that contains this address and this will locate the main + // executable's mapping that contains the ELF header. + AuxVector aux_vector(m_auxv); + if (std::optional opt_value = + aux_vector.GetAuxValue(AuxVector::AUXV_AT_PHDR)) { + if (std::optional nt = + GetNTFileEntryContainingAddress(*opt_value)) + return *nt; + } + // Fall back to trying to find the first NT_FILE entry that contains the entry + // point address. 
+ if (std::optional opt_value = + aux_vector.GetAuxValue(AuxVector::AUXV_AT_ENTRY)) { + if (std::optional nt = + GetNTFileEntryContainingAddress(*opt_value)) + return *nt; + } + return std::nullopt; +} diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h index 2b6b34075252f..e6f1fa0027554 100644 --- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h +++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h @@ -172,8 +172,8 @@ class ProcessElfCore : public lldb_private::PostMortemProcess { lldb_private::UUID FindModuleUUID(const llvm::StringRef path) override; - // Returns the main executable path. - std::string GetMainExecutablePath(); + // Extract the executable module spec for the executable in this core file. + bool GetMainExecutableModuleSpec(lldb_private::ModuleSpec &exe_spec); // Returns the value of certain type of note of a given start address lldb_private::UUID FindBuidIdInCoreMemory(lldb::addr_t address); @@ -192,6 +192,12 @@ class ProcessElfCore : public lldb_private::PostMortemProcess { llvm::Error parseNetBSDNotes(llvm::ArrayRef notes); llvm::Error parseOpenBSDNotes(llvm::ArrayRef notes); llvm::Error parseLinuxNotes(llvm::ArrayRef notes); + + /// Find the NT_FILE entry that contains an address. + std::optional + GetNTFileEntryContainingAddress(lldb::addr_t addr); + /// Intelligently find the NT_FILE entry for the executable's ELF header. 
+ std::optional GetNTFileEntryForExecutableELFHeader(); }; #endif // LLDB_SOURCE_PLUGINS_PROCESS_ELF_CORE_PROCESSELFCORE_H diff --git a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp index 19ae1cf392efa..553998c903eea 100644 --- a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp +++ b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp @@ -41,10 +41,9 @@ bool UnwindAssemblyInstEmulation::GetNonCallSiteUnwindPlanFromAssembly( ProcessSP process_sp(thread.GetProcess()); if (process_sp) { Status error; - const bool force_live_memory = true; if (process_sp->GetTarget().ReadMemory( range.GetBaseAddress(), function_text.data(), range.GetByteSize(), - error, force_live_memory) != range.GetByteSize()) { + error) != range.GetByteSize()) { return false; } } diff --git a/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/Makefile b/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/Makefile index df9f4a7b518c7..99998b20bcb05 100644 --- a/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/Makefile +++ b/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/Makefile @@ -1,4 +1,3 @@ CXX_SOURCES := main.cpp -CFLAGS_EXTRAS := -g0 include Makefile.rules diff --git a/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/TestFrameVarDILNoDebugInfo.py b/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/TestFrameVarDILNoDebugInfo.py index defea39826267..10dbd3a6953f2 100644 --- a/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/TestFrameVarDILNoDebugInfo.py +++ b/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/TestFrameVarDILNoDebugInfo.py @@ -10,9 +10,10 @@ class TestFrameVarDILNoDebugInfo(TestBase): NO_DEBUG_INFO_TESTCASE = True + SHARED_BUILD_TESTCASE = False def test_no_debug_info(self): - self.build() + self.build(debug_info="none") 
lldbutil.run_to_name_breakpoint(self, "main") self.runCmd("settings set target.experimental.use-DIL true") diff --git a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py index 959339c2c6ca0..da7f0ca7f9e71 100644 --- a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py +++ b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py @@ -1101,7 +1101,7 @@ def test_linux_no_exe(self): libraries are available. The "image list" output should look like: (lldb) image list - [ 0] 7BCC1101 0x000055bb04288000 /data/users/gclayton/args/elf-crash (0x000055bb04288000) + [ 0] 9FD61477 0x000055bb04288000 /data/users/gclayton/args/elf-crash (0x000055bb04288000) [ 1] 0x00007f27db200000 /libxx/libstdc++.so.6 [ 2] AF275675-4671-8B49-24C8-A9A657D74115-C80DEE65 0x00007f27db51b000 /libxx/libm.so.6 (0x00007f27db51b000) [ 3] 0x00007f27db4fe000 /libxx/libgcc_s.so.1 @@ -1119,7 +1119,7 @@ self.assertEqual( m.GetObjectFileHeaderAddress().GetLoadAddress(target), 0x000055BB04288000 ) - self.assertEqual(m.GetUUIDString(), "7BCC1101") + self.assertEqual(m.GetUUIDString(), "9FD61477") m = target.module["/libxx/libstdc++.so.6"] self.assertTrue(m.IsValid()) @@ -1284,6 +1284,55 @@ def do_test(self, filename, pid, region_count, thread_name): self.dbg.DeleteTarget(target) + def test_exe_name_extraction_nt_file(self): + # This core file has: + # - NT_FILE entry for the executable with path '/path/nt_file_foo' + # - AT_EXECFN that points to "/path/execfn_foo" + # - NT_PRPSINFO with a pr_fname member set to 'prpsinfo_foo' + # We expect the NT_FILE version to be found since this is a resolved + # file path and it is the best information we can use for the executable + # name.
+ yaml_path = self.getSourcePath("elf-NT_FILE-NT_PRPSINFO-AT_EXECFN.yaml") + core_path = self.getBuildArtifact("elf-NT_FILE-NT_PRPSINFO-AT_EXECFN.core") + self.yaml2obj(yaml_path, core_path) + target = self.dbg.CreateTarget(None) + process = target.LoadCore(core_path) + exe_module = target.modules[0] + self.assertEqual(exe_module.GetFileSpec().fullpath, "/path/nt_file_foo") + self.dbg.DeleteTarget(target) + + def test_exe_name_extraction_at_execfn(self): + # This core file has: + # - AT_EXECFN that points to "/path/execfn_foo" + # - NT_PRPSINFO with a pr_fname member set to 'prpsinfo_foo' + # There is no NT_FILE in this core file, so we expect to fall back to + # the AT_EXECFN name in memory as it has a full path to the executable. + # This path can differ from the path found in NT_FILE as it might not + # be resolved as it can be a symlink path. + yaml_path = self.getSourcePath("elf-NT_PRPSINFO-AT_EXECFN.yaml") + core_path = self.getBuildArtifact("elf-NT_PRPSINFO-AT_EXECFN.core") + self.yaml2obj(yaml_path, core_path) + target = self.dbg.CreateTarget(None) + process = target.LoadCore(core_path) + exe_module = target.modules[0] + self.assertEqual(exe_module.GetFileSpec().fullpath, "/path/execfn_foo") + self.dbg.DeleteTarget(target) + + def test_exe_name_extraction_nt_prpsinfo(self): + # This core file has: + # - NT_PRPSINFO with a pr_fname member set to 'prpsinfo_foo' + # There is no NT_FILE or AT_EXECFN in the aux vector in this core file. + # We expect to fall back to the info in the NT_PRPSINFO note.
+ yaml_path = self.getSourcePath("elf-NT_PRPSINFO.yaml") + core_path = self.getBuildArtifact("elf-NT_PRPSINFO.core") + self.yaml2obj(yaml_path, core_path) + target = self.dbg.CreateTarget(None) + process = target.LoadCore(core_path) + exe_module = target.modules[0] + self.assertEqual(exe_module.GetFileSpec().fullpath, "prpsinfo_foo") + self.dbg.DeleteTarget(target) + + def replace_path(binary, replace_from, replace_to): src = replace_from.encode() diff --git a/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_FILE-NT_PRPSINFO-AT_EXECFN.yaml b/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_FILE-NT_PRPSINFO-AT_EXECFN.yaml new file mode 100644 index 0000000000000..5bc949ade451f --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_FILE-NT_PRPSINFO-AT_EXECFN.yaml @@ -0,0 +1,29 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_CORE + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_NOTE + Align: 0x4 + FileSize: 0x2c4 + Offset: 0xb0 + - Type: PT_LOAD + Flags: [ PF_R ] + VAddr: 0x10000 + Align: 0x4 + FileSize: 0x11 + MemSize: 0x1f + Offset: 0x374 +Sections: + - Type: Fill + Pattern: 
0600000030000000060000004c494e5558000000030000000000000040802804bb5500001f00000000000000000001000000000000000000000000000000000000000000050000005001000001000000434f5245000000000b00000000000000000000000b000000000000000000000000000000000000004a4433005f7220004a4433005f7220000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e063db277f0000b8bd2804bb55000030912804bb550000f8191ec0ff7f0000e0181ec0ff7f00000000000000000000a0acffda277f0000e90000000000000060c060db277f000050c4ffda277f000000000000000000000100000000000000081a1ec0ff7f0000f8191ec0ff7f00000100000000000000ffffffffffffffff4e912804bb55000033000000000000004602010000000000e0181ec0ff7f00002b0000000000000040974fdb277f0000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000050000006a000000454c4946434f5245000000000200000000000000001000000000000000802804bb55000000902804bb55000000000000000000000000e07e147f000000c0e27e147f000020000000000000002f706174682f6e745f66696c655f666f6f002f706174682f6e745f66696c652f6c6962632e736f2e36000000050000008800000003000000434f524500000000024400000000000008044040000000003000000030000000510d0000360d0000d9040000d904000070727073696e666f5f666f6f000000002f706174682f70727073696e666f5f666f6f202d2d766572626f736500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + Size: 0x2c4 + Offset: 0xb0 + - Type: Fill + Pattern: 2f706174682f65786563666e5f666f6f00 + Size: 0x11 + Offset: 0x374 + - Type: SectionHeaderTable + NoHeaders: true diff --git a/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_PRPSINFO-AT_EXECFN.yaml b/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_PRPSINFO-AT_EXECFN.yaml new file mode 100644 index 0000000000000..c6834dd7dda8f --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_PRPSINFO-AT_EXECFN.yaml @@ -0,0 +1,29 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + 
Type: ET_CORE + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_NOTE + Align: 0x4 + FileSize: 0x234 + Offset: 0xb0 + - Type: PT_LOAD + Flags: [ PF_R ] + VAddr: 0x10000 + Align: 0x4 + FileSize: 0x11 + MemSize: 0x1f + Offset: 0x2e4 +Sections: + - Type: Fill + Pattern: 0600000020000000060000004c494e55580000001f00000000000000000001000000000000000000000000000000000000000000050000005001000001000000434f5245000000000b00000000000000000000000b000000000000000000000000000000000000004a4433005f7220004a4433005f7220000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e063db277f0000b8bd2804bb55000030912804bb550000f8191ec0ff7f0000e0181ec0ff7f00000000000000000000a0acffda277f0000e90000000000000060c060db277f000050c4ffda277f000000000000000000000100000000000000081a1ec0ff7f0000f8191ec0ff7f00000100000000000000ffffffffffffffff4e912804bb55000033000000000000004602010000000000e0181ec0ff7f00002b0000000000000040974fdb277f0000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000050000008800000003000000434f524500000000024400000000000008044040000000003000000030000000510d0000360d0000d9040000d904000070727073696e666f5f666f6f000000002f706174682f70727073696e666f5f666f6f202d2d766572626f736500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + Size: 0x234 + Offset: 0xb0 + - Type: Fill + Pattern: 2f706174682f65786563666e5f666f6f00 + Size: 0x11 + Offset: 0x2e4 + - Type: SectionHeaderTable + NoHeaders: true diff --git a/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_PRPSINFO.yaml b/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_PRPSINFO.yaml new file mode 100644 index 0000000000000..99b5c5dde0903 --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_PRPSINFO.yaml @@ -0,0 +1,18 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_CORE + Machine: EM_X86_64 
+ProgramHeaders: + - Type: PT_NOTE + Align: 0x4 + FileSize: 0x224 + Offset: 0x78 +Sections: + - Type: Fill + Pattern: 0600000010000000060000004c494e555800000000000000000000000000000000000000050000005001000001000000434f5245000000000b00000000000000000000000b000000000000000000000000000000000000004a4433005f7220004a4433005f7220000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e063db277f0000b8bd2804bb55000030912804bb550000f8191ec0ff7f0000e0181ec0ff7f00000000000000000000a0acffda277f0000e90000000000000060c060db277f000050c4ffda277f000000000000000000000100000000000000081a1ec0ff7f0000f8191ec0ff7f00000100000000000000ffffffffffffffff4e912804bb55000033000000000000004602010000000000e0181ec0ff7f00002b0000000000000040974fdb277f0000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000050000008800000003000000434f524500000000024400000000000008044040000000003000000030000000510d0000360d0000d9040000d904000070727073696e666f5f666f6f000000002f706174682f70727073696e666f5f666f6f202d2d766572626f736500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + Size: 0x224 + Offset: 0x78 + - Type: SectionHeaderTable + NoHeaders: true diff --git a/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/Makefile b/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/Makefile index 7c3c32d6f82df..99998b20bcb05 100644 --- a/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/Makefile +++ b/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/Makefile @@ -1,4 +1,3 @@ CXX_SOURCES := main.cpp -CXXFLAGS_EXTRAS := -g0 include Makefile.rules diff --git a/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/TestObjCFromCppFramesWithoutDebugInfo.py b/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/TestObjCFromCppFramesWithoutDebugInfo.py index 
497c0dd128f48..ddc7498a72e8d 100644 --- a/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/TestObjCFromCppFramesWithoutDebugInfo.py +++ b/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/TestObjCFromCppFramesWithoutDebugInfo.py @@ -10,8 +10,11 @@ class TestObjCFromCppFramesWithoutDebugInfo(TestBase): + NO_DEBUG_INFO_TESTCASE = True + SHARED_BUILD_TESTCASE = False + def test(self): - self.build() + self.build(debug_info="none") (_, process, _, _) = lldbutil.run_to_name_breakpoint(self, "main") self.assertState(process.GetState(), lldb.eStateStopped) diff --git a/llvm/docs/Instrumentor.rst b/llvm/docs/Instrumentor.rst new file mode 100644 index 0000000000000..b908122599d53 --- /dev/null +++ b/llvm/docs/Instrumentor.rst @@ -0,0 +1,786 @@ +================================== +The LLVM Instrumentor Pass +================================== + +.. contents:: + :local: + +Introduction +============ + +The **Instrumentor** is a highly configurable instrumentation pass for LLVM-IR +that allows users to insert custom runtime function calls at various program +points. Unlike traditional instrumentation tools that are hardcoded for +specific purposes (like sanitizers or profilers), the Instrumentor provides a +flexible, configuration-driven approach where users can specify: + +- **What** to instrument (loads, stores, allocations, function calls, etc.) +- **Where** to instrument (before or after operations) +- **What information** to pass to the runtime (pointers, values, sizes, types, etc.) +- **Whether** to modify program behavior (replace values, redirect pointers, etc.) 
+ +The Instrumentor is designed to support a wide variety of use cases including: + +- Custom memory profilers and trackers +- Performance analysis tools +- Dynamic program analysis +- Debugging and tracing utilities +- Stack usage monitoring +- Custom sanitizers and checkers + +To use the Instrumentor it is recommended to run the wizard script located at +`./llvm/utils/instrumentor-config-wizard.py`. The script will interactively +create a configuration file and a stub runtime which is required to be linked +into the instrumented program. + +Key Features +============ + +Configurable Instrumentation Opportunities +------------------------------------------- + +The Instrumentor supports instrumentation at multiple levels: + +**Instruction-level:** + - **Load instructions**: Instrument memory reads with access to pointer, loaded value, alignment, size, atomicity, etc. + - **Store instructions**: Instrument memory writes with access to pointer, stored value, alignment, size, atomicity, etc. + - **Alloca instructions**: Instrument stack allocations with access to size, alignment, and allocated address + +**Function-level:** + - **Function entry**: Instrument at function start with access to function name, address, arguments, etc. 
+ - **Function exit**: Instrument at function return + +**Future extensions:** + - Basic block entry/exit + - Module-level initialization + - Global variable access + +PRE and POST Instrumentation +----------------------------- + +Each instrumentation opportunity supports two positions: + +- **PRE**: Insert instrumentation **before** the operation occurs + + - For loads: can inspect/modify the pointer before reading + - For stores: can inspect/modify the pointer and value before writing + - For allocas: can modify the allocation size + - For functions: instrument at function entry, inspect/replace arguments + +- **POST**: Insert instrumentation **after** the operation occurs + + - For loads: can inspect/modify the loaded value + - For stores: instrument after the write completes + - For allocas: can inspect/modify the allocated address + - For functions: instrument at function exit + +Selective Argument Passing +--------------------------- + +For each instrumentation opportunity, users can individually enable/disable specific arguments to control: + +- What information is passed to the runtime function +- The signature of the generated runtime function +- Performance overhead (fewer arguments = faster calls) + +For example, for load instrumentation, you can choose to pass: + +- Pointer address +- Pointer address space +- Loaded value +- Value size +- Alignment +- Value type ID +- Atomicity ordering +- Synchronization scope +- Volatility flag +- Unique instrumentation ID + +Value Replacement +----------------- + +The Instrumentor supports **replacing** values returned from the runtime: + +- **Load replacement**: The runtime can provide a different value than what was loaded from memory +- **Store replacement**: The runtime can modify the pointer or value being stored +- **Alloca replacement**: The runtime can provide a different allocation size or replace the allocated address +- **Argument replacement**: The runtime can modify the arguments passed to a function + 
+This enables use cases like: + +- Value redirection for debugging +- Custom memory allocators +- Fault injection +- Taint tracking + +Instrumentation Filtering +------------------------- + +The Instrumentor provides fine-grained control over what gets instrumented: + +- **Target regex**: Match against the target triple (e.g., ``x86_64-.*-linux``) +- **Host/GPU toggle**: Separately enable/disable CPU and GPU instrumentation +- **Function filtering**: Exclude runtime functions from instrumentation via a regular expression + +Configuration System +==================== + +The Instrumentor uses a JSON-based configuration system that allows users to: + +1. Generate a default configuration showing all available options +2. Interactively customize the configuration using the wizard +3. Load and modify existing configurations +4. Generate runtime stub implementations + +Configuration File Format +------------------------- + +The configuration file is a JSON document with the following structure: + +.. 
code-block:: json + + { + "configuration": { + "runtime_prefix": "__instrumentor_", + "target_regex": "", + "host_enabled": true, + "gpu_enabled": true + }, + "function_pre": { + "function": { + "enabled": true, + "address": true, + "name": true, + "id": true + } + }, + "instruction_pre": { + "load": { + "enabled": true, + "pointer": true, + "pointer.replace": false, + "value_size": true, + "id": true + }, + "store": { + "enabled": true, + "pointer": true, + "value": true, + "value_size": true + } + }, + "instruction_post": { + "load": { + "enabled": true, + "value": true, + "value.replace": false + } + } + } + +Configuration Sections +---------------------- + +**configuration** + Global settings that apply to all instrumentation: + + - ``runtime_prefix``: Prefix for all runtime function names (default: ``__instrumentor_``) + - ``target_regex``: Regular expression to filter targets (empty = all targets) + - ``host_enabled``: Enable instrumentation for CPU targets (default: true) + - ``gpu_enabled``: Enable instrumentation for GPU targets (default: true) + +**function_pre / function_post** + Function-level instrumentation configuration. + +**instruction_pre / instruction_post** + Instruction-level instrumentation configuration, with subsections for each instruction type (``load``, ``store``, ``alloca``, etc.). + +Argument Configuration +---------------------- + +For each instrumentation opportunity, arguments are configured with: + +- **enabled**: Boolean to enable/disable the entire opportunity +- ****: Boolean to enable/disable passing this argument +- **.replace**: Boolean to enable value replacement (only for replaceable arguments) +- **.description**: Human-readable description of the argument + +The Configuration Wizard +========================= + +The Instrumentor includes an interactive configuration wizard that simplifies the process of creating and modifying configurations. + +Running the Wizard +------------------ + +.. 
code-block:: bash + + # Run the wizard interactively + ./llvm/utils/instrumentor-config-wizard.py + + # Specify output location + ./llvm/utils/instrumentor-config-wizard.py -o my_config.json + + # Use specific opt binary + ./llvm/utils/instrumentor-config-wizard.py --opt-path /path/to/opt + + # Load and modify existing configuration + ./llvm/utils/instrumentor-config-wizard.py --input existing.json -o modified.json + +Wizard Workflow +--------------- + +The wizard guides you through five steps: + +**Step 1: Select Instrumentation Types** + Choose which types of operations to instrument (load, store, alloca, function, etc.). This is a high-level selection - you can configure individual arguments later. + +**Step 2: PRE vs POST Configuration** + Decide whether PRE and POST instrumentation should use the same configuration or different configurations. This saves time when you want both positions to have identical settings. + +**Step 3: Base Configuration** + Configure global settings: + + - Runtime prefix for function names + - Target regex for filtering + - Enable/disable host (CPU) instrumentation + - Enable/disable GPU instrumentation + +**Step 4: Configure Arguments** + For each enabled instrumentation type, select which arguments to pass to the runtime function. You can: + + - Toggle individual arguments on/off + - Enable value replacement for replaceable arguments + - Enable all or disable all arguments + - Configure PRE and POST separately (if selected in Step 2) + +**Step 5: Review and Save** + Review your configuration and optionally generate runtime stub implementations. The wizard displays a summary and provides commands for using the configuration with ``opt`` and ``clang``. + +Generating Runtime Stubs +------------------------- + +The wizard can automatically generate C stub implementations of your runtime functions: + +1. In Step 5, select 'g' to generate stubs +2. Specify the output file path (default: ``_stubs.c``) +3. 
The wizard creates a C file with stub implementations that print their arguments + +The generated stubs are useful as: + +- Starting templates for implementing your runtime +- Documentation of the expected function signatures +- Quick prototypes for testing instrumentation + +Example stub output: + +.. code-block:: c + + void __instrumentor_pre_load(void *pointer, int32_t pointer_as, + uint64_t value_size, int32_t id) { + printf("load pre -- pointer: %p, pointer_as: %i, " + "value_size: %lu, id: %i\n", + pointer, pointer_as, value_size, id); + } + +Usage Examples +============== + +Basic Usage with opt +-------------------- + +**Step 1: (Optional) Generate a default configuration** + +.. code-block:: bash + + opt -passes=instrumentor \ + -instrumentor-write-config-file=config.json \ + -disable-output \ + input.ll + +This creates ``config.json`` with all available instrumentation opportunities and their arguments. + +**Step 2: Customize the configuration** + +Edit ``config.json`` manually or use the wizard (no input needed): + +.. code-block:: bash + + ./llvm/utils/instrumentor-config-wizard.py --input config.json -o custom.json + +**Step 3: Apply instrumentation** + +.. code-block:: bash + + opt -passes=instrumentor \ + -instrumentor-read-config-file=custom.json \ + input.ll -S -o instrumented.ll + +The instrumented output contains calls to your runtime functions at the configured program points. + +Using with Clang +---------------- + +To instrument during compilation: + +.. code-block:: bash + + clang -mllvm -enable-instrumentor \ + -mllvm -instrumentor-read-config-file=config.json \ + source.c -o program + +Complete Workflow Example +-------------------------- + +Here's a complete example for creating a simple memory access profiler: + +**1. Create configuration with the wizard:** + +.. 
code-block:: bash + + ./llvm/utils/instrumentor-config-wizard.py -o memory_profiler.json + + # In the wizard: + # - Enable: load, store + # - Use same config for PRE/POST: yes + # - Base config: keep defaults + # - For load/store: enable pointer, value_size, id + # - Generate stubs: yes (memory_profiler_stubs.c) + +**2. Implement the runtime:** + +.. code-block:: c + + // memory_runtime.c + #include + #include + + static uint64_t load_count = 0; + static uint64_t store_count = 0; + + void __instrumentor_pre_load(void *pointer, uint64_t value_size, + int32_t id) { + load_count++; + printf("Load from %p (size: %lu, id: %d)\n", + pointer, value_size, id); + } + + void __instrumentor_pre_store(void *pointer, uint64_t value_size, + int32_t id) { + store_count++; + printf("Store to %p (size: %lu, id: %d)\n", + pointer, value_size, id); + } + + __attribute__((destructor)) + void print_stats(void) { + printf("Total loads: %lu\n", load_count); + printf("Total stores: %lu\n", store_count); + } + +**3. Instrument and compile:** + +.. code-block:: bash + + # Instrument the program + clang -emit-llvm -S -o program.ll program.c + opt -passes=instrumentor \ + -instrumentor-read-config-file=memory_profiler.json \ + program.ll -S -o program_inst.ll + + # Compile with runtime + clang program_inst.ll memory_runtime.c -o program + +**4. Run and observe:** + +.. code-block:: bash + + ./program + # Output includes: + # Load from 0x7ffc12345678 (size: 4, id: 1) + # Store to 0x7ffc12345680 (size: 8, id: 2) + # ... + # Total loads: 42 + # Total stores: 27 + +Advanced Use Cases +================== + +Stack Usage Profiling +---------------------- + +Configure alloca instrumentation to track stack allocations: + +.. code-block:: json + + { + "instruction_pre": { + "alloca": { + "enabled": true, + "size": true, + "alignment": true, + "id": true + } + }, + "instruction_post": { + "alloca": { + "enabled": true, + "address": true, + "size": true + } + } + } + +Runtime implementation: + +.. 
code-block:: c + + static uint64_t total_stack_usage = 0; + static uint64_t peak_stack_usage = 0; + static uint64_t current_stack_usage = 0; + + void __instrumentor_post_alloca(void *address, uint64_t size, + int32_t id) { + current_stack_usage += size; + total_stack_usage += size; + if (current_stack_usage > peak_stack_usage) { + peak_stack_usage = current_stack_usage; + } + } + +Value Replacement for Fault Injection +-------------------------------------- + +Use value replacement to inject faults: + +.. code-block:: json + + { + "instruction_post": { + "load": { + "enabled": true, + "value": true, + "value.replace": true, + "pointer": true + } + } + } + +Runtime implementation: + +.. code-block:: c + + // Replace every 1000th loaded value with zero + static uint64_t load_counter = 0; + + uint64_t __instrumentor_post_load(uint64_t value, void *pointer) { + if (++load_counter % 1000 == 0) { + printf("Injecting fault at %p\n", pointer); + return 0; // Return fault value + } + return value; // Return original value + } + +Function-Level Tracing +---------------------- + +Instrument function entry and exit: + +.. code-block:: json + + { + "function_pre": { + "function": { + "enabled": true, + "name": true, + "address": true, + "num_arguments": true + } + }, + "function_post": { + "function": { + "enabled": true, + "name": true + } + } + } + +Runtime implementation: + +.. code-block:: c + + static int call_depth = 0; + + void __instrumentor_pre_function(char *name, void *address, + int32_t num_args, int32_t id) { + printf("%*sEntering %s (%p) with %d args\n", + call_depth * 2, "", name, address, num_args); + call_depth++; + } + + void __instrumentor_post_function(char *name, int32_t id) { + call_depth--; + printf("%*sExiting %s\n", call_depth * 2, "", name); + } + +GPU Instrumentation +------------------- + +The Instrumentor supports GPU targets (AMDGPU and NVPTX). Configure GPU-specific instrumentation: + +.. 
code-block:: json + + { + "configuration": { + "runtime_prefix": "__gpu_runtime_", + "target_regex": "(amdgcn|nvptx).*", + "host_enabled": false, + "gpu_enabled": true + }, + "instruction_pre": { + "load": { + "enabled": true, + "pointer": true, + "pointer_as": true + } + } + } + +Note that GPU runtime functions must be implemented with appropriate device attributes. + +Implementation Details +====================== + +Generated Runtime Function Signatures +-------------------------------------- + +The Instrumentor generates runtime function names following this pattern: + +.. code-block:: text + + _[_ind] + +Where: + +- ````: Configurable prefix (default: ``__instrumentor_``) +- ````: Either ``pre`` or ``post`` +- ````: Name of the instrumentation opportunity (``load``, ``store``, ``function``, etc.) +- ``_ind``: Optional suffix when indirection is used (see below) + +Examples: + +- ``__instrumentor_pre_load`` +- ``__instrumentor_post_store`` +- ``__instrumentor_pre_function`` +- ``__instrumentor_pre_load_ind`` (with indirection) + +Direct vs Indirect Arguments +----------------------------- + +The Instrumentor uses two modes for passing arguments: + +**Direct mode** (default): + Arguments are passed by value. This is efficient but requires that all arguments fit in registers or can be passed through the stack efficiently. + +**Indirect mode**: + Arguments are passed by pointer. This is used automatically when: + + - Multiple replaceable arguments are enabled (requires indirection for all replaceable args) + - An argument's value is too large (aggregate types, large values) + +When indirect mode is used, a separate function with the ``_ind`` suffix is generated: + +.. 
code-block:: c + + // Direct mode + void __instrumentor_pre_load(void *pointer, uint64_t value_size); + + // Indirect mode (automatically generated when needed) + void __instrumentor_pre_load_ind(void **pointer, uint32_t pointer_size, + void *value_size, uint32_t value_size_size); + +Users typically don't need to worry about this - the Instrumentor handles it automatically and the wizard-generated stubs show the correct signatures. + +Unique IDs +---------- + +When the ``id`` argument is enabled, the Instrumentor assigns a unique 32-bit integer to each instrumentation call site: + +- PRE positions get positive IDs (1, 2, 3, ...) +- POST positions get negative IDs (-1, -2, -3, ...) +- IDs are consistent across multiple runs + +Caching +------- + +The Instrumentor caches certain argument values between PRE and POST calls when possible: + +- Values computed in PRE are reused in POST (e.g., pointer value) +- This reduces overhead and ensures consistency + +Runtime Function Requirements +------------------------------ + +Runtime functions must be: + +- Defined with external linkage +- Fast and non-blocking (to minimize instrumentation overhead) +- Thread-safe if the program is multi-threaded + +Runtime functions **must not**: + +- Call back into instrumented code (to avoid infinite recursion) + +Performance Considerations +========================== + +Overhead Factors +---------------- + +Instrumentation overhead depends on: + +1. **Number of instrumentation points**: More instrumented operations = more overhead +2. **Number of arguments passed**: Each argument adds instructions and register pressure +3. **Runtime function complexity**: Complex runtime logic increases overhead +4. **Frequency of instrumented operations**: Instrumenting hot loops has high impact + +Optimization Tips +----------------- + +**Minimize arguments:** + Only enable arguments you actually need. Passing fewer arguments reduces overhead. 
+ +**Use PRE or POST, not both:** + If you only need one position, disable the other. + +**Target filtering:** + Use ``target_regex`` to instrument only specific targets or modules. + +**Efficient runtime:** + Keep runtime functions simple and fast. Consider: + + - Lock-free data structures + - Thread-local storage + - Batching outputs instead of per-call I/O + - Sampling (instrument 1 in N calls) + +**Build with optimizations:** + Use ``-O2`` or ``-O3`` when compiling instrumented code. LLVM can optimize away some overhead. + +Troubleshooting +=============== + +Common Issues +------------- + +**"Could not find 'opt' binary"** + The wizard can't locate the opt binary. + + - Specify the path: ``--opt-path /path/to/opt`` + +**"Indirection needed but not indicated"** + An argument value is too large for direct passing. The Instrumentor handles this automatically, but you might see this warning. It's usually harmless - the indirect version of the function will be generated. + +**Infinite recursion / stack overflow** + Your runtime function is calling back into instrumented code. Solutions: + + - Ensure runtime functions don't trigger more instrumentation + +**Linking errors** + Runtime functions are undefined. You must: + + - Implement all enabled runtime functions + - Link the runtime implementation with your program + - Use the exact function signatures (check generated stubs) + +**Unexpected instrumentation** + More instrumentation than expected. Check: + + - The ``enabled`` flag for each opportunity + - ``host_enabled`` / ``gpu_enabled`` settings + - ``target_regex`` matches your target + - Runtime functions aren't being instrumented (they should be automatically excluded) + +Debugging Instrumented Code +---------------------------- + +**View instrumented IR:** + +.. 
code-block:: bash + + opt -passes=instrumentor \ + -instrumentor-read-config-file=config.json \ + input.ll -S -o output.ll + + # Examine output.ll to see inserted calls + +**Print configuration:** + +.. code-block:: bash + + opt -passes=instrumentor \ + -instrumentor-write-config-file=debug_config.json \ + input.ll -disable-output + + # Examine debug_config.json to see all options + +**Verify IR:** + The Instrumentor automatically verifies the module after instrumentation. If verification fails, there's a bug in the Instrumentor or the configuration is invalid. + +**Use debug builds:** + Build LLVM with assertions enabled (``-DLLVM_ENABLE_ASSERTIONS=ON``) to catch issues early. + +Extending the Instrumentor +=========================== + +The Instrumentor is designed to be extensible. To add new instrumentation opportunities: + +1. **Define the opportunity class** inheriting from ``InstrumentationOpportunity`` +2. **Implement getter/setter functions** for the arguments +3. **Add initialization** to populate the opportunity with arguments +4. **Register** the opportunity in ``InstrumentationConfig::populate()`` +5. **Add tests** in ``llvm/test/Transforms/Instrumentor/`` + +See ``llvm/lib/Transforms/IPO/Instrumentor.cpp`` and ``llvm/include/llvm/Transforms/IPO/Instrumentor.h`` for examples (``LoadIO``, ``StoreIO``). + +Future instrumentation opportunities being considered: + +- Basic block entry/exit +- Branch instrumentation +- Call instructions +- Atomic operations +- Vector operations +- Exception handling +- Global variable access + +Reference +========= + +Command-Line Options +-------------------- + +**-instrumentor-read-config-file=** + Load instrumentation configuration from the specified JSON file. + +**-instrumentor-write-config-file=** + Write the default instrumentation configuration to the specified JSON file (useful for generating templates). 
+ +Related Passes +-------------- + +The Instrumentor is more flexible but related to: + +- **AddressSanitizer**: Specialized memory error detector +- **ThreadSanitizer**: Race condition detector +- **MemorySanitizer**: Uninitialized memory detector +- **DataFlowSanitizer**: Taint tracking +- **XRay**: Function call tracing with low overhead + +The Instrumentor can implement similar functionality with custom runtime code, but specialized passes may have better performance for their specific use cases. + +Further Reading +--------------- + +- Source code: ``llvm/lib/Transforms/IPO/Instrumentor.cpp`` +- Header: ``llvm/include/llvm/Transforms/IPO/Instrumentor.h`` +- Configuration wizard: ``llvm/utils/instrumentor-config-wizard.py`` diff --git a/llvm/docs/TableGen/ProgRef.rst b/llvm/docs/TableGen/ProgRef.rst index 1f42adaf6b6df..d417559ba31b3 100644 --- a/llvm/docs/TableGen/ProgRef.rst +++ b/llvm/docs/TableGen/ProgRef.rst @@ -2008,6 +2008,21 @@ and non-0 as true. This operator produces the size of the string, list, or dag *a*. The size of a DAG is the number of arguments; the operator does not count. +``!sort(``\ *var*\ ``,`` *list*\ ``,`` *key*\ ``)`` + This operator creates a new ``list`` containing the same elements as *list* + but in sorted order. To determine the order, TableGen binds the variable + *var* to each element and evaluates the *key* expression, which presumably + refers to *var*. The key must produce a ``string`` or integer value + (``bit``, ``bits``, or ``int``); all keys must be of the same type. Elements + with equal keys preserve their original relative order, resulting in a + stable sort. + + For example, to sort a list of records by their ``Name`` field: + + .. code-block:: text + + list<Thing> sorted = !sort(t, Things, t.Name); + ``!sra(``\ *a*\ ``,`` *count*\ ``)`` This operator shifts *a* right arithmetically by *count* bits and produces the resulting value. 
The operation is performed on a 64-bit integer; the result diff --git a/llvm/docs/UserGuides.rst b/llvm/docs/UserGuides.rst index 375098e5839a1..d544bed9762b1 100644 --- a/llvm/docs/UserGuides.rst +++ b/llvm/docs/UserGuides.rst @@ -326,6 +326,7 @@ intermediate LLVM representation. Remarks SourceLevelDebugging HowToUpdateDebugInfo + Instrumentor InstrRefDebugInfo RemoveDIsDebugInfo KeyInstructionsDebugInfo @@ -503,6 +504,10 @@ Optimizations This document specifies guidelines for contributions for InstCombine and related passes. +:doc:`Instrumentor` + A comprehensive guide to the highly configurable Instrumentor pass for custom + program instrumentation, including the interactive configuration wizard. + Code Generation --------------- diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index dc7676671225b..43ffedd67f213 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3788,27 +3788,27 @@ def int_amdgcn_sat_pk4_u4_u8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_u4_u8">, PureIntrinsic<[llvm_i16_ty], [llvm_i32_ty]>; // llvm.amdgcn.permlane.bcast -def int_amdgcn_permlane_bcast : ClangBuiltin<"__builtin_amdgcn_permlane_bcast">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], +def int_amdgcn_permlane_bcast : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // llvm.amdgcn.permlane.up -def int_amdgcn_permlane_up : ClangBuiltin<"__builtin_amdgcn_permlane_up">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], +def int_amdgcn_permlane_up : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // llvm.amdgcn.permlane.down -def int_amdgcn_permlane_down : ClangBuiltin<"__builtin_amdgcn_permlane_down">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, 
llvm_i32_ty, llvm_i32_ty], +def int_amdgcn_permlane_down : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // llvm.amdgcn.permlane.xor -def int_amdgcn_permlane_xor : ClangBuiltin<"__builtin_amdgcn_permlane_xor">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], +def int_amdgcn_permlane_xor : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // llvm.amdgcn.permlane.idx.gen diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index cb2721aba4f25..ce03995bc46c8 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -968,6 +968,7 @@ class TernOpInit final : public OpInit, public FoldingSetNode { FIND, SETDAGARG, SETDAGNAME, + SORT, }; private: diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 4eaa9c1c7e98f..7fddba776c694 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -1067,11 +1067,12 @@ bool LLParser::skipModuleSummaryEntry() { // support is in place we will look for the tokens corresponding to the // expected tags. 
if (Lex.getKind() != lltok::kw_gv && Lex.getKind() != lltok::kw_module && - Lex.getKind() != lltok::kw_typeid && Lex.getKind() != lltok::kw_flags && - Lex.getKind() != lltok::kw_blockcount) - return tokError( - "Expected 'gv', 'module', 'typeid', 'flags' or 'blockcount' at the " - "start of summary entry"); + Lex.getKind() != lltok::kw_typeid && + Lex.getKind() != lltok::kw_typeidCompatibleVTable && + Lex.getKind() != lltok::kw_flags && Lex.getKind() != lltok::kw_blockcount) + return tokError("Expected 'gv', 'module', 'typeid', " + "'typeidCompatibleVTable', 'flags' or 'blockcount' at the " + "start of summary entry"); if (Lex.getKind() == lltok::kw_flags) return parseSummaryIndexFlags(); if (Lex.getKind() == lltok::kw_blockcount) diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 3395d2dd10a1b..a043119006312 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -810,7 +810,7 @@ std::string ListInit::getAsString() const { } const Init *OpInit::getBit(unsigned Bit) const { - if (getType() == BitRecTy::get(getRecordKeeper())) + if (isa(getType())) return this; return VarBitInit::get(this, Bit); } @@ -1788,6 +1788,57 @@ static const Init *FilterHelper(const Init *LHS, const Init *MHS, return nullptr; } +static const Init *SortHelper(const Init *LHS, const Init *MHS, const Init *RHS, + const RecTy *Type, const Record *CurRec) { + const auto *MHSl = dyn_cast(MHS); + if (!MHSl) + return nullptr; + + RecordKeeper &RK = LHS->getRecordKeeper(); + using KV = std::pair; + SmallVector KeyedList; + + for (const Init *Item : MHSl->getElements()) { + const Init *Key = ItemApply(LHS, Item, RHS, CurRec); + if (!Key) + return nullptr; + KeyedList.emplace_back(Key, Item); + } + + if (KeyedList.empty()) + return ListInit::get({}, cast(Type)->getElementType()); + + // Determine key type from the first element; all keys must agree. 
+ bool UseInt = + dyn_cast_or_null(KeyedList[0].first->convertInitializerTo( + IntRecTy::get(RK))) != nullptr; + for (auto &[Key, Item] : KeyedList) { + if (UseInt) { + if (!dyn_cast_or_null( + Key->convertInitializerTo(IntRecTy::get(RK)))) + return nullptr; + } else { + if (!isa(Key)) + return nullptr; + } + } + + llvm::stable_sort(KeyedList, [&RK, UseInt](const KV &A, const KV &B) { + if (UseInt) + return cast(A.first->convertInitializerTo(IntRecTy::get(RK))) + ->getValue() < + cast(B.first->convertInitializerTo(IntRecTy::get(RK))) + ->getValue(); + return cast(A.first)->getValue() < + cast(B.first)->getValue(); + }); + + SmallVector Result; + for (auto &[Key, Item] : KeyedList) + Result.push_back(Item); + return ListInit::get(Result, cast(Type)->getElementType()); +} + const Init *TernOpInit::Fold(const Record *CurRec) const { RecordKeeper &RK = getRecordKeeper(); switch (getOpcode()) { @@ -1845,6 +1896,12 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { break; } + case SORT: { + if (const Init *Result = SortHelper(LHS, MHS, RHS, getType(), CurRec)) + return Result; + break; + } + case IF: { if (const auto *LHSi = dyn_cast_or_null( LHS->convertInitializerTo(IntRecTy::get(RK)))) { @@ -2004,7 +2061,7 @@ const Init *TernOpInit::resolveReferences(Resolver &R) const { const Init *mhs = MHS->resolveReferences(R); const Init *rhs; - if (getOpcode() == FOREACH || getOpcode() == FILTER) { + if (getOpcode() == FOREACH || getOpcode() == FILTER || getOpcode() == SORT) { ShadowResolver SR(R); SR.addShadow(lhs); rhs = RHS->resolveReferences(SR); @@ -2025,6 +2082,10 @@ std::string TernOpInit::getAsString() const { case DAG: Result = "!dag"; break; case FILTER: Result = "!filter"; UnquotedLHS = true; break; case FOREACH: Result = "!foreach"; UnquotedLHS = true; break; + case SORT: + Result = "!sort"; + UnquotedLHS = true; + break; case IF: Result = "!if"; break; case RANGE: Result = "!range"; @@ -2329,7 +2390,7 @@ const RecTy *TypedInit::getFieldType(const 
StringInit *FieldName) const { } const Init *TypedInit::convertInitializerTo(const RecTy *Ty) const { - if (getType() == Ty || getType()->typeIsA(Ty)) + if (getType()->typeIsA(Ty)) return this; if (isa(getType()) && isa(Ty) && @@ -2358,7 +2419,7 @@ TypedInit::convertInitializerBitRange(ArrayRef Bits) const { const Init *TypedInit::getCastTo(const RecTy *Ty) const { // Handle the common case quickly - if (getType() == Ty || getType()->typeIsA(Ty)) + if (getType()->typeIsA(Ty)) return this; if (const Init *Converted = convertInitializerTo(Ty)) { @@ -2392,7 +2453,7 @@ StringRef VarInit::getName() const { } const Init *VarInit::getBit(unsigned Bit) const { - if (getType() == BitRecTy::get(getRecordKeeper())) + if (isa(getType())) return this; return VarBitInit::get(this, Bit); } @@ -2585,7 +2646,7 @@ const FieldInit *FieldInit::get(const Init *R, const StringInit *FN) { } const Init *FieldInit::getBit(unsigned Bit) const { - if (getType() == BitRecTy::get(getRecordKeeper())) + if (isa(getType())) return this; return VarBitInit::get(this, Bit); } @@ -2871,7 +2932,7 @@ StringRef RecordVal::getName() const { } std::string RecordVal::getPrintType() const { - if (getType() == StringRecTy::get(getRecordKeeper())) { + if (isa(getType())) { if (const auto *StrInit = dyn_cast(Value)) { if (StrInit->hasCodeFormat()) return "code"; diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index 3c88f107f790a..be642b30261c6 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -676,6 +676,7 @@ tgtok::TokKind TGLexer::LexExclaim() { .Case("listsplat", tgtok::XListSplat) .Case("listremove", tgtok::XListRemove) .Case("range", tgtok::XRange) + .Case("sort", tgtok::XSort) .Case("strconcat", tgtok::XStrConcat) .Case("initialized", tgtok::XInitialized) .Case("interleave", tgtok::XInterleave) diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index a0ade6412024e..4490ed55f37ef 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ 
b/llvm/lib/TableGen/TGLexer.h @@ -156,6 +156,7 @@ enum TokKind { XToLower, XToUpper, XRange, + XSort, XGetDagArg, XGetDagName, XSetDagArg, diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp index c44e067a9da9f..8d9890cea18e7 100644 --- a/llvm/lib/TableGen/TGParser.cpp +++ b/llvm/lib/TableGen/TGParser.cpp @@ -1941,8 +1941,9 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { } case tgtok::XForEach: - case tgtok::XFilter: { - return ParseOperationForEachFilter(CurRec, ItemType); + case tgtok::XFilter: + case tgtok::XSort: { + return ParseOperationListComprehension(CurRec, ItemType); } case tgtok::XRange: { @@ -2571,12 +2572,13 @@ const Init *TGParser::ParseOperationFind(Record *CurRec, return (TernOpInit::get(Code, LHS, MHS, RHS, Type))->Fold(CurRec); } -/// Parse the !foreach and !filter operations. Return null on error. +/// Parse the !foreach, !filter, and !sort operations. Return null on error. /// /// ForEach ::= !foreach(ID, list-or-dag, expr) => list -/// Filter ::= !foreach(ID, list, predicate) ==> list -const Init *TGParser::ParseOperationForEachFilter(Record *CurRec, - const RecTy *ItemType) { +/// Filter ::= !filter(ID, list, predicate) ==> list +/// Sort ::= !sort(ID, list, key-expr) ==> list +const Init *TGParser::ParseOperationListComprehension(Record *CurRec, + const RecTy *ItemType) { SMLoc OpLoc = Lex.getLoc(); tgtok::TokKind Operation = Lex.getCode(); Lex.Lex(); // eat the operation @@ -2628,9 +2630,19 @@ const Init *TGParser::ParseOperationForEachFilter(Record *CurRec, InEltType = InListTy->getElementType(); if (ItemType) { if (const auto *OutListTy = dyn_cast(ItemType)) { - ExprEltType = (Operation == tgtok::XForEach) - ? 
OutListTy->getElementType() - : IntRecTy::get(Records); + switch (Operation) { + case tgtok::XForEach: + ExprEltType = OutListTy->getElementType(); + break; + case tgtok::XFilter: + ExprEltType = IntRecTy::get(Records); + break; + case tgtok::XSort: + ExprEltType = nullptr; + break; + default: + llvm_unreachable("unexpected token"); + } } else { Error(OpLoc, "expected value of type '" + Twine(ItemType->getAsString()) + @@ -2639,9 +2651,17 @@ const Init *TGParser::ParseOperationForEachFilter(Record *CurRec, } } } else if (const auto *InDagTy = dyn_cast(MHSt->getType())) { - if (Operation == tgtok::XFilter) { + switch (Operation) { + case tgtok::XFilter: TokError("!filter must have a list argument"); return nullptr; + case tgtok::XSort: + TokError("!sort must have a list argument"); + return nullptr; + case tgtok::XForEach: + break; + default: + llvm_unreachable("unexpected token"); } InEltType = InDagTy; if (ItemType && !isa(ItemType)) { @@ -2651,11 +2671,19 @@ const Init *TGParser::ParseOperationForEachFilter(Record *CurRec, } IsDAG = true; } else { - if (Operation == tgtok::XForEach) + switch (Operation) { + case tgtok::XForEach: TokError("!foreach must have a list or dag argument"); - else + return nullptr; + case tgtok::XFilter: TokError("!filter must have a list argument"); - return nullptr; + return nullptr; + case tgtok::XSort: + TokError("!sort must have a list argument"); + return nullptr; + default: + llvm_unreachable("unexpected token"); + } } // We need to create a temporary record to provide a scope for the @@ -2680,22 +2708,34 @@ const Init *TGParser::ParseOperationForEachFilter(Record *CurRec, return nullptr; } - const RecTy *OutType = InEltType; - if (Operation == tgtok::XForEach && !IsDAG) { - const auto *RHSt = dyn_cast(RHS); - if (!RHSt) { - TokError("could not get type of !foreach result expression"); - return nullptr; + const RecTy *OutType; + TernOpInit::TernaryOp Opc; + switch (Operation) { + case tgtok::XForEach: + Opc = TernOpInit::FOREACH; 
+ if (IsDAG) { + OutType = InEltType; + } else { + const auto *RHSt = dyn_cast(RHS); + if (!RHSt) { + TokError("could not get type of !foreach result expression"); + return nullptr; + } + OutType = RHSt->getType()->getListTy(); } - OutType = RHSt->getType()->getListTy(); - } else if (Operation == tgtok::XFilter) { + break; + case tgtok::XFilter: + Opc = TernOpInit::FILTER; + OutType = InEltType->getListTy(); + break; + case tgtok::XSort: + Opc = TernOpInit::SORT; OutType = InEltType->getListTy(); + break; + default: + llvm_unreachable("unexpected token"); } - - return (TernOpInit::get((Operation == tgtok::XForEach) ? TernOpInit::FOREACH - : TernOpInit::FILTER, - LHS, MHS, RHS, OutType)) - ->Fold(CurRec); + return (TernOpInit::get(Opc, LHS, MHS, RHS, OutType))->Fold(CurRec); } const Init *TGParser::ParseOperationCond(Record *CurRec, diff --git a/llvm/lib/TableGen/TGParser.h b/llvm/lib/TableGen/TGParser.h index 9f0b89f080c9e..3e7cd2c48b56a 100644 --- a/llvm/lib/TableGen/TGParser.h +++ b/llvm/lib/TableGen/TGParser.h @@ -326,8 +326,8 @@ class TGParser { const Init *ParseOperation(Record *CurRec, const RecTy *ItemType); const Init *ParseOperationSubstr(Record *CurRec, const RecTy *ItemType); const Init *ParseOperationFind(Record *CurRec, const RecTy *ItemType); - const Init *ParseOperationForEachFilter(Record *CurRec, - const RecTy *ItemType); + const Init *ParseOperationListComprehension(Record *CurRec, + const RecTy *ItemType); const Init *ParseOperationCond(Record *CurRec, const RecTy *ItemType); const RecTy *ParseOperatorType(); const Init *ParseObjectName(MultiClass *CurMultiClass); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 4e4c7951b0dcc..46bb9649b12d7 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -8260,7 +8260,8 @@ generateGatherLanePattern(MachineInstr &Root, NewRegister) .addReg(SrcRegister) .addImm(Lane) - 
.addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState)); + .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState)) + .setMemRefs(OriginalInstr->memoperands()); InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size())); InsInstrs.push_back(LoadIndexIntoRegister); return NewRegister; @@ -8268,9 +8269,9 @@ generateGatherLanePattern(MachineInstr &Root, // Helper to create load instruction based on the NumLanes in the NEON // register we are rewriting. - auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg, - Register OffsetReg, - bool KillState) -> MachineInstrBuilder { + auto CreateLDRInstruction = + [&](unsigned NumLanes, Register DestReg, Register OffsetReg, + ArrayRef MMOs) -> MachineInstrBuilder { unsigned Opcode; switch (NumLanes) { case 4: @@ -8289,7 +8290,8 @@ generateGatherLanePattern(MachineInstr &Root, // Immediate offset load return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg) .addReg(OffsetReg) - .addImm(0); + .addImm(0) + .setMemRefs(MMOs); }; // Load the remaining lanes into register 0. @@ -8319,7 +8321,7 @@ generateGatherLanePattern(MachineInstr &Root, MachineInstrBuilder MiddleIndexLoadInstr = CreateLDRInstruction(NumLanes, DestRegForMiddleIndex, OriginalSplitToLoadOffsetOperand.getReg(), - OriginalSplitToLoadOffsetOperand.isKill()); + OriginalSplitLoad->memoperands()); InstrIdxForVirtReg.insert( std::make_pair(DestRegForMiddleIndex, InsInstrs.size())); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index d65ffb1c36814..e814316b0f2ed 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -969,6 +969,26 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // We only care about the mapping of the destination for COPY. /*NumOperands*/ Opc == TargetOpcode::G_BITCAST ? 
2 : 1); } + case TargetOpcode::G_CONSTANT: { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + TypeSize Size = DstTy.getSizeInBits(); + if (!DstTy.isPointer() && (!DstTy.isScalar() || Size < 32 || Size > 64)) + break; + // Scalar constants materialize in GPRs. + [[fallthrough]]; + } + case TargetOpcode::G_BRCOND: + case TargetOpcode::G_FRAME_INDEX: { + // Operand 0 is the only banked operand and is mapped to GPR. + return getInstructionMapping( + DefaultMappingID, /*Cost=*/1, + getOperandsMapping( + {getValueMapping( + PMI_FirstGPR, + MRI.getType(MI.getOperand(0).getReg()).getSizeInBits()), + nullptr}), + /*NumOperands=*/2); + } default: break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 8e38532fe315c..1a0e4f2eaa416 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -6130,6 +6130,10 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, IID == Intrinsic::amdgcn_permlanex16; bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive || IID == Intrinsic::amdgcn_set_inactive_chain_arg; + bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast || + IID == Intrinsic::amdgcn_permlane_up || + IID == Intrinsic::amdgcn_permlane_down || + IID == Intrinsic::amdgcn_permlane_xor; auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1, Register Src2, LLT VT) -> Register { @@ -6143,6 +6147,10 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, case Intrinsic::amdgcn_set_inactive_chain_arg: return LaneOp.addUse(Src1).getReg(0); case Intrinsic::amdgcn_writelane: + case Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: return LaneOp.addUse(Src1).addUse(Src2).getReg(0); case Intrinsic::amdgcn_permlane16: case Intrinsic::amdgcn_permlanex16: { @@ -6174,9 +6182,11 @@ bool 
AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, Register Src0 = MI.getOperand(2).getReg(); Register Src1, Src2; if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || - IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) { + IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 || + IsPermlaneShuffle) { Src1 = MI.getOperand(3).getReg(); - if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) { + if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 || + IsPermlaneShuffle) { Src2 = MI.getOperand(4).getReg(); } } @@ -8451,6 +8461,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_set_inactive_chain_arg: case Intrinsic::amdgcn_mov_dpp8: case Intrinsic::amdgcn_update_dpp: + case Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: return legalizeLaneOp(Helper, MI, IntrID); case Intrinsic::amdgcn_s_buffer_prefetch_data: return legalizeSBufferPrefetch(Helper, MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 36aeef3558672..ca29635578945 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -1704,10 +1704,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForIOpcs({amdgcn_permlane_bcast, amdgcn_permlane_up, amdgcn_permlane_down, amdgcn_permlane_xor}, - Standard) - .Div(S32, - {{Vgpr32}, - {IntrId, Vgpr32, SgprB32_ReadFirstLane, SgprB32_ReadFirstLane}}); + StandardB) + .Div(B32, + {{VgprB32}, + {IntrId, VgprB32, SgprB32_ReadFirstLane, SgprB32_ReadFirstLane}}); addRulesForIOpcs({amdgcn_permlane_idx_gen}, Standard) .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, SgprB32_ReadFirstLane}}); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp 
b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index cc8cea8f8411c..e15cc2c072334 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -86,6 +86,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool hasFPModifiers() const { return Abs || Neg; } bool hasIntModifiers() const { return Sext; } bool hasModifiers() const { return hasFPModifiers() || hasIntModifiers(); } + bool isForcedLit() const { return Lit == LitModifier::Lit; } bool isForcedLit64() const { return Lit == LitModifier::Lit64; } int64_t getFPModifiersOperand() const { @@ -1053,6 +1054,10 @@ class AMDGPUOperand : public MCParsedAsmOperand { return getModifiers().hasIntModifiers(); } + bool isForcedLit() const { + return isImmLiteral() && getModifiers().isForcedLit(); + } + bool isForcedLit64() const { return isImmLiteral() && getModifiers().isForcedLit64(); } @@ -5133,11 +5138,12 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst, Imm = getLitValue(MO.getExpr()); bool IsAnotherLiteral = false; + bool IsForcedLit = findMCOperand(Operands, OpIdx).isForcedLit(); bool IsForcedLit64 = findMCOperand(Operands, OpIdx).isForcedLit64(); if (!Imm.has_value()) { // Literal value not known, so we conservately assume it's different. 
IsAnotherLiteral = true; - } else if (IsForcedLit64 || !isInlineConstant(Inst, OpIdx)) { + } else if (IsForcedLit || IsForcedLit64 || !isInlineConstant(Inst, OpIdx)) { uint64_t Value = *Imm; bool IsForcedFP64 = Desc.operands()[OpIdx].OperandType == AMDGPU::OPERAND_KIMM64 || diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index cd56887fd46a8..825634d7af65b 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -53,6 +53,8 @@ class GCNPreRAOptimizationsImpl { LiveIntervals *LIS; bool processReg(Register Reg); + void hintTrue16Copy(const MachineInstr &MI); + bool optimizeBVHStack(MachineInstr &MI); public: GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {} @@ -238,6 +240,65 @@ GCNPreRAOptimizationsPass::run(MachineFunction &MF, return PreservedAnalyses::all(); } +void GCNPreRAOptimizationsImpl::hintTrue16Copy(const MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, Dst); + bool IsDst16Bit = AMDGPU::VGPR_16RegClass.hasSubClassEq(DstRC); + if (Dst.isVirtual() && IsDst16Bit && Src.isPhysical() && + TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass) + MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16)); + if (Src.isVirtual() && MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass && + Dst.isPhysical() && DstRC == &AMDGPU::VGPR_32RegClass) + MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16)); + if (!Dst.isVirtual() || !Src.isVirtual()) + return; + if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass && + MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) { + MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src); + MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst); + } + if (IsDst16Bit && MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass) + MRI->setRegAllocationHint(Dst, 
AMDGPURI::Size16, Src); +} + +bool GCNPreRAOptimizationsImpl::optimizeBVHStack(MachineInstr &MI) { + SmallVector UseRegs; + + // Find BVH sources for this DS_BVH_STACK instruction. + auto CheckUse = [&](MachineOperand &Use) { + Register Reg = Use.getReg(); + for (const MachineInstr &Src : MRI->def_instructions(Reg)) { + if (!SIInstrInfo::isImage(Src)) + continue; + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Src.getOpcode()); + const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = + AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); + if (!BaseInfo->BVH) + continue; + UseRegs.push_back(Reg); + break; + } + }; + CheckUse(*TII->getNamedOperand(MI, AMDGPU::OpName::data0)); + CheckUse(*TII->getNamedOperand(MI, AMDGPU::OpName::data1)); + + if (UseRegs.empty()) + return false; + + // Add implicit uses for entire BVH source registers. + // This avoids partial reallocation of register which could + // introduce a premature s_wait_bvhcnt. + for (Register Reg : UseRegs) { + MI.addOperand(MachineOperand::CreateReg(Reg, false, true)); + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + } + LLVM_DEBUG(dbgs() << "Added implicit uses to: " << MI); + + return true; +} + bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); @@ -258,34 +319,27 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { Changed |= processReg(Reg); } - if (!ST.useRealTrue16Insts()) + const bool HasBVHStack = ST.hasBVHDualAndBVH8Insts(); + const bool HasRealTrue16 = ST.useRealTrue16Insts(); + + if (!HasRealTrue16 && !HasBVHStack) return Changed; - // Add RA hints to improve True16 COPY elimination. - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - if (MI.getOpcode() != AMDGPU::COPY) + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + // Add RA hints to improve True16 COPY elimination. 
+ if (HasRealTrue16 && MI.getOpcode() == AMDGPU::COPY) { + hintTrue16Copy(MI); continue; - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, Dst); - bool IsDst16Bit = AMDGPU::VGPR_16RegClass.hasSubClassEq(DstRC); - if (Dst.isVirtual() && IsDst16Bit && Src.isPhysical() && - TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass) - MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16)); - if (Src.isVirtual() && - MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass && - Dst.isPhysical() && DstRC == &AMDGPU::VGPR_32RegClass) - MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16)); - if (!Dst.isVirtual() || !Src.isVirtual()) + } + // Add implicit uses to avoid early wait on intersect ray instructions. + if (HasBVHStack && + (MI.getOpcode() == AMDGPU::DS_BVH_STACK_RTN_B32 || + MI.getOpcode() == AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32 || + MI.getOpcode() == AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64)) { + Changed |= optimizeBVHStack(MI); continue; - if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass && - MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) { - MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src); - MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst); } - if (IsDst16Bit && MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass) - MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src); } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 21c088fafd91e..c2b33a35f50d5 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7884,6 +7884,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, IID == Intrinsic::amdgcn_permlanex16; bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive || IID == Intrinsic::amdgcn_set_inactive_chain_arg; + bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast || + 
IID == Intrinsic::amdgcn_permlane_up || + IID == Intrinsic::amdgcn_permlane_down || + IID == Intrinsic::amdgcn_permlane_xor; SDLoc SL(N); MVT IntVT = MVT::getIntegerVT(ValSize); const GCNSubtarget *ST = TLI.getSubtarget(); @@ -7905,6 +7909,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, Operands.push_back(N->getOperand(4)); [[fallthrough]]; case Intrinsic::amdgcn_writelane: + case Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: Operands.push_back(Src2); [[fallthrough]]; case Intrinsic::amdgcn_readlane: @@ -7938,10 +7946,12 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SDValue Src1, Src2; if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || IID == Intrinsic::amdgcn_mov_dpp8 || - IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) { + IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 || + IsPermlaneShuffle) { Src1 = N->getOperand(2); if (IID == Intrinsic::amdgcn_writelane || - IID == Intrinsic::amdgcn_update_dpp || IsPermLane16) + IID == Intrinsic::amdgcn_update_dpp || IsPermLane16 || + IsPermlaneShuffle) Src2 = N->getOperand(3); } @@ -8036,18 +8046,21 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, DAG.getConstant(EltIdx, SL, MVT::i32)); if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || - IsPermLane16) + IsPermLane16) { Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1, DAG.getConstant(EltIdx, SL, MVT::i32)); - if (IID == Intrinsic::amdgcn_writelane) + Pieces.push_back( + createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)); + } else if (IID == Intrinsic::amdgcn_writelane) { Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2, DAG.getConstant(EltIdx, SL, MVT::i32)); + Pieces.push_back( + createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT)); + } else { + 
Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2, SubVecVT)); + } - Pieces.push_back( - IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 - ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT) - : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT)); EltIdx += SubVecNumElt; } return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces); @@ -11173,6 +11186,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_set_inactive_chain_arg: case Intrinsic::amdgcn_mov_dpp8: case Intrinsic::amdgcn_update_dpp: + case Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: return lowerLaneOp(*this, Op.getNode(), DAG); case Intrinsic::amdgcn_dead: { SmallVector Poisons; @@ -15522,12 +15539,6 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N, SDLoc(N), VT); } - if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP || - N0.getOpcode() == ISD::SINT_TO_FP)) { - return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0, - N->getFlags()); - } - // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here. 
if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) && N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 2f74bac039304..4d1c174c9f10d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5282,17 +5282,13 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } break; case AMDGPU::OPERAND_REG_IMM_INT32: - case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_IMM_FP32: - case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_IMM_BF16: case AMDGPU::OPERAND_REG_IMM_FP16: - case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case AMDGPU::OPERAND_REG_IMM_V2INT16: - case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_REG_IMM_V2BF16: break; case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: @@ -5317,6 +5313,19 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } break; } + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + if (ST.has64BitLiterals() && Desc.getSize() != 4 && MO.isImm() && + !isInlineConstant(MI, i) && + !AMDGPU::isValid32BitLiteral(MO.getImm(), + OpInfo.OperandType == + AMDGPU::OPERAND_REG_IMM_FP64)) { + ErrInfo = "illegal 64-bit immediate value for operand."; + return false; + } + break; case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: case AMDGPU::OPERAND_INPUT_MODS: if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) { diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 3a477cad4248c..1c6447de407f3 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1205,8 +1205,8 @@ class PermlaneVarPat; class PermlaneNoDppPat3Src : GCNPat< - (permlane 
i32:$src0, i32:$src1, i32:$src2), + Instruction inst, ValueType vt> : GCNPat< + (vt (permlane vt:$src0, i32:$src1, i32:$src2)), (inst VGPR_32:$src0, SCSrc_b32:$src1, SCSrc_b32:$src2) >; @@ -1611,10 +1611,12 @@ let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 in { defm V_PERMLANE_IDX_GEN_B32 : VOP3Inst<"v_permlane_idx_gen_b32", VOP3_PERMLANE_NOOPSEL_Profile>; } // End isConvergent = 1 - def : PermlaneNoDppPat3Src; - def : PermlaneNoDppPat3Src; - def : PermlaneNoDppPat3Src; - def : PermlaneNoDppPat3Src; + foreach vt = Reg32Types.types in { + def : PermlaneNoDppPat3Src; + def : PermlaneNoDppPat3Src; + def : PermlaneNoDppPat3Src; + def : PermlaneNoDppPat3Src; + } def : PermlaneNoDppPat2Src; } // End SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 68958a8cf32d5..d375a52884b80 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -14814,75 +14814,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(Ptr); break; } - case PPC::LWAT_PSEUDO: - case PPC::LDAT_PSEUDO: { - DebugLoc DL = MI.getDebugLoc(); - Register DstReg = MI.getOperand(0).getReg(); - Register PtrReg = MI.getOperand(1).getReg(); - Register ValReg = MI.getOperand(2).getReg(); - unsigned FC = MI.getOperand(3).getImm(); - bool IsLwat = MI.getOpcode() == PPC::LWAT_PSEUDO; - Register Val64 = MRI.createVirtualRegister(&PPC::G8RCRegClass); - if (IsLwat) - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::SUBREG_TO_REG), Val64) - .addReg(ValReg) - .addImm(PPC::sub_32); - else - Val64 = ValReg; - - Register G8rPair = MRI.createVirtualRegister(&PPC::G8pRCRegClass); - Register UndefG8r = MRI.createVirtualRegister(&PPC::G8RCRegClass); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), UndefG8r); - BuildMI(*BB, MI, DL, TII->get(PPC::REG_SEQUENCE), G8rPair) - .addReg(UndefG8r) - 
.addImm(PPC::sub_gp8_x0) - .addReg(Val64) - .addImm(PPC::sub_gp8_x1); - - Register PairResult = MRI.createVirtualRegister(&PPC::G8pRCRegClass); - BuildMI(*BB, MI, DL, TII->get(IsLwat ? PPC::LWAT : PPC::LDAT), PairResult) - .addReg(G8rPair) - .addReg(PtrReg) - .addImm(FC); - Register Result64 = MRI.createVirtualRegister(&PPC::G8RCRegClass); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), Result64) - .addReg(PairResult, {}, PPC::sub_gp8_x0); - if (IsLwat) - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), DstReg) - .addReg(Result64, {}, PPC::sub_32); - else - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), DstReg) - .addReg(Result64); - break; - } - case PPC::LWAT_COND_PSEUDO: - case PPC::LDAT_COND_PSEUDO: { - DebugLoc DL = MI.getDebugLoc(); - Register DstReg = MI.getOperand(0).getReg(); - Register PtrReg = MI.getOperand(1).getReg(); - unsigned FC = MI.getOperand(2).getImm(); - bool IsLwat_Cond = MI.getOpcode() == PPC::LWAT_COND_PSEUDO; - - Register Pair = MRI.createVirtualRegister(&PPC::G8pRCRegClass); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Pair); - - Register PairResult = MRI.createVirtualRegister(&PPC::G8pRCRegClass); - BuildMI(*BB, MI, DL, TII->get(IsLwat_Cond ? 
PPC::LWAT : PPC::LDAT), - PairResult) - .addReg(Pair) - .addReg(PtrReg) - .addImm(FC); - Register Result64 = MRI.createVirtualRegister(&PPC::G8RCRegClass); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), Result64) - .addReg(PairResult, {}, PPC::sub_gp8_x0); - if (IsLwat_Cond) - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), DstReg) - .addReg(Result64, {}, PPC::sub_32); - else - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), DstReg) - .addReg(Result64); - break; - } default: llvm_unreachable("Unexpected instr type to insert"); } diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 3c130077f3988..294297645e166 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -348,19 +348,11 @@ def LDAT_CSNE : X_RD5_RS5_IM5<31, 614, (outs g8rc:$RST), (ins ptr_rc_nor0:$RA), Requires<[IsISA3_0]>; } -def LDAT_PSEUDO : PPCCustomInserterPseudo< - (outs g8rc:$dst), - (ins ptr_rc_nor0:$ptr, g8rc:$val, u5imm:$fc), - "#LDAT_PSEUDO", - [(set i64:$dst, (int_ppc_amo_ldat ptr_rc_nor0:$ptr, g8rc:$val, - u5imm_timm:$fc))]>; +def : Pat<(int_ppc_amo_ldat ptr_rc_nor0:$ptr, g8rc:$val, u5imm_timm:$fc), + (EVEN8 (LDAT (PAIR8 (i64 (IMPLICIT_DEF)), $val), $ptr, $fc))>; -def LDAT_COND_PSEUDO : PPCCustomInserterPseudo < - (outs g8rc:$dst), - (ins ptr_rc_nor0:$ptr, u5imm:$fc), - "#LDAT_COND_PSEUDO", - [(set i64:$dst, (int_ppc_amo_ldat_cond ptr_rc_nor0:$ptr, - u5imm_timm:$fc))]>; +def : Pat<(int_ppc_amo_ldat_cond ptr_rc_nor0:$ptr, u5imm_timm:$fc), + (EVEN8 (LDAT (PAIR8 (i64 (IMPLICIT_DEF)), (i64 (IMPLICIT_DEF))), $ptr, $fc))>; let Defs = [X8, X9, X10], Uses = [X9, X10] in def LDAT_CSNE_PSEUDO : PPCPostRAExpPseudo< diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index eb4099b532336..926a184246dd5 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -962,6 +962,24 @@ class BinOpWithoutSImm16Operand : 
def add_without_simm16 : BinOpWithoutSImm16Operand; def mul_without_simm16 : BinOpWithoutSImm16Operand; +//===----------------------------------------------------------------------===// +// Output pattern fragments. +// + +// Create an even/odd register pair. +def PAIR8 : OutPatFrag<(ops node:$even, node:$odd), + (REG_SEQUENCE G8pRC, $even, sub_gp8_x0, $odd, sub_gp8_x1)>; + +// Return the even part of an even/odd register pair. +def EVEN8 : OutPatFrag<(ops node:$pair), (EXTRACT_SUBREG $pair, sub_gp8_x0)>; + +// Any-extend a 32-bit value in GPRC to a 64-bit value in G8RC. +def AEXT8 : OutPatFrag<(ops node:$r), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $r, sub_32)>; + +// Truncate a 64-bit value in a G8RC value to 32-bit value in GPRC. +def TRUNC4 : OutPatFrag<(ops node:$r), (EXTRACT_SUBREG $r, sub_32)>; + //===----------------------------------------------------------------------===// // PowerPC Flag Definitions. @@ -2129,17 +2147,12 @@ def LWAT_CSNE : X_RD5_RS5_IM5<31, 582, (outs g8rc:$RST), (ins ptr_rc_nor0:$RA), "lwat $RST, $RA, 16", IIC_LdStLoad>, Requires<[IsISA3_0]>; -def LWAT_PSEUDO : PPCCustomInserterPseudo< - (outs gprc:$dst), - (ins ptr_rc_nor0:$ptr, gprc:$val, u5imm:$fc), - "#LWAT_PSEUDO", - [(set i32:$dst, (int_ppc_amo_lwat ptr_rc_nor0:$ptr, gprc:$val, u5imm_timm:$fc))]>; +def : Pat<(int_ppc_amo_lwat ptr_rc_nor0:$ptr, gprc:$val, u5imm_timm:$fc), + (TRUNC4 (LWAT (PAIR8 (i64 (IMPLICIT_DEF)), (AEXT8 $val)), $ptr, $fc))>; -def LWAT_COND_PSEUDO : PPCCustomInserterPseudo < - (outs gprc:$dst), - (ins ptr_rc_nor0:$ptr, u5imm:$fc), - "#LWAT_COND_PSEUDO", - [(set i32:$dst, (int_ppc_amo_lwat_cond ptr_rc_nor0:$ptr, u5imm_timm:$fc))]>; +def : Pat<(int_ppc_amo_lwat_cond ptr_rc_nor0:$ptr, u5imm_timm:$fc), + (TRUNC4 (LWAT (PAIR8 (i64 (IMPLICIT_DEF)), (i64 (IMPLICIT_DEF))), + $ptr, $fc))>; let Defs = [R8, R9, R10], Uses = [R9, R10] in def LWAT_CSNE_PSEUDO : PPCPostRAExpPseudo< diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp 
index aa5b864df5936..f41f4b8784dd0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -321,6 +321,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } if (Subtarget.hasAVX10_2()) { + for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v32i8}) { + setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); + } setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal); @@ -1166,7 +1170,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // SSE2 can use basic vector unrolling. // SSE41 can use PHMINPOS to perform v16i8/v8i16 minmax reductions. - for (auto VT : {MVT::v16i8, MVT::v8i16}) { + // Fallback to ReplaceNodeResults for vXi64 reductions on 32-bit targets. + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::i64}) { setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); @@ -1423,15 +1428,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } - // Allow v4i32/v2i64 minmax reductions with SSE41 vector comparison, - // select and minmax handling. 
- for (auto VT : { MVT::v4i32, MVT::v2i64 }) { - setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); - setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); - } - // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); @@ -22785,13 +22781,21 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { EVT SrcVT = Src.getValueType(); EVT DstVT = Node->getValueType(0); EVT TmpVT = DstVT; + EVT SatVT = cast(Node->getOperand(1))->getVT(); + + if (Subtarget.hasAVX10_2() && SrcVT.isVector() && + SrcVT.getVectorElementType() == MVT::bf16 && SatVT == MVT::i8) { + MVT VecI16VT = SrcVT.getSimpleVT().changeVectorElementType(MVT::i16); + SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2IBS : X86ISD::CVTTP2IUBS, + dl, VecI16VT, Src); + return DAG.getNode(ISD::TRUNCATE, dl, DstVT, Res); + } // This code is only for floats and doubles. Fall back to generic code for // anything else. if (!isScalarFPTypeInSSEReg(SrcVT) || isBF16orSoftF16(SrcVT, Subtarget)) return SDValue(); - EVT SatVT = cast(Node->getOperand(1))->getVT(); unsigned SatWidth = SatVT.getScalarSizeInBits(); unsigned DstWidth = DstVT.getScalarSizeInBits(); unsigned TmpWidth = TmpVT.getScalarSizeInBits(); @@ -29680,6 +29684,7 @@ static SDValue LowerVECREDUCE(SDValue Op, const X86Subtarget &Subtarget, SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); EVT SrcSVT = SrcVT.getScalarType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc DL(Op); if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0) @@ -29697,6 +29702,14 @@ static SDValue LowerVECREDUCE(SDValue Op, const X86Subtarget &Subtarget, // Expand 128-bit shuffle tree + reduction binops. 
unsigned NumSrcElts = SrcVT.getVectorNumElements(); for (unsigned NumElts = NumSrcElts; NumElts != 1; NumElts /= 2) { + // Scalarize the last 2 elements if the vector binop isn't legal. + if (NumElts == 2 && !Subtarget.hasAVX512() && + !TLI.isOperationLegal(BinOp, SrcVT) && TLI.isTypeLegal(ExtractVT)) { + return DAG.getNode(BinOp, DL, ExtractVT, + DAG.getExtractVectorElt(DL, ExtractVT, Src, 0), + DAG.getExtractVectorElt(DL, ExtractVT, Src, 1)); + } + SmallVector Mask(NumSrcElts, -1); std::iota(Mask.begin(), Mask.begin() + (NumElts / 2), NumElts / 2); SDValue Upper = @@ -35334,6 +35347,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: { + assert(N->getValueType(0) == MVT::i64 && "Unexpected vector reduction"); + if (SDValue Res = LowerMINMAX_REDUCE(SDValue(N, 0), Subtarget, DAG)) + Results.push_back(Res); + return; + } case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: { if (!Subtarget.hasAVX10_2()) @@ -35343,8 +35365,17 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT VT = N->getValueType(0); SDValue Op = N->getOperand(0); EVT OpVT = Op.getValueType(); + EVT SatVT = cast(N->getOperand(1))->getVT(); SDValue Res; + if (VT == MVT::v8i8 && OpVT == MVT::v8bf16 && SatVT == MVT::i8) { + Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2IBS : X86ISD::CVTTP2IUBS, dl, + MVT::v8i16, Op); + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + Results.push_back(Res); + return; + } + if (VT == MVT::v2i32 && OpVT == MVT::v2f64) { if (IsSigned) Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op); @@ -47106,10 +47137,6 @@ static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, // ISD::VECREDUCE_SMIN/SMAX/UMIN/UMAX. 
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - EVT ExtractVT = Extract->getValueType(0); - if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT)) - return SDValue(); - // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns. ISD::NodeType BinOp; SDValue Src = DAG.matchBinOpReduction( @@ -47139,7 +47166,7 @@ static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, llvm_unreachable("Unexpected reduction"); } - return DAG.getNode(RdxOp, SDLoc(Extract), ExtractVT, Src); + return DAG.getNode(RdxOp, SDLoc(Extract), Extract->getValueType(0), Src); } // Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 00d378adf9c5d..7213d4ae795ec 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5523,6 +5523,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (VF.isScalable()) return InstructionCost::getInvalid(); return TTI.getArithmeticInstrCost(Instruction::Mul, RetTy, Config.CostKind); + case Instruction::Freeze: + return TTI::TCC_Free; default: // This opcode is unknown. Assume that it is the same as 'mul'. return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 27569f9966de0..397366bae9af9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1000,9 +1000,12 @@ InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode( RHSInfo, Operands, CtxI, &Ctx.TLI); } case Instruction::Freeze: - // This opcode is unknown. Assume that it is the same as 'mul'. 
- return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy, - Ctx.CostKind); + // NOTE: The only way to ask for the cost is via getInstructionCost, which + // requires the actual vector instruction. Instead, both here and in the + // LoopVectorizationCostModel::getInstructionCost the costs mirror the + // current behaviour in llvm/Analysis/TargetTransformInfoImpl.h to keep + // them in sync. + return TTI::TCC_Free; case Instruction::ExtractValue: return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue, Ctx.CostKind); diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 68663ae820b57..9c57f1f2e5367 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -802,28 +802,28 @@ define amdgpu_kernel void @v_permlane32_swap(ptr addrspace(1) %out, i32 %src0, i ret void } -; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2) +; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.bcast.i32(i32 %src0, i32 %src1, i32 %src2) define amdgpu_kernel void @v_permlane_bcast_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { %result= call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2) store i32 %result, ptr addrspace(1) %out ret void } -; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2) +; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.up.i32(i32 %src0, i32 %src1, i32 %src2) define amdgpu_kernel void @v_permlane_up_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { %result= call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2) store i32 %result, ptr addrspace(1) %out ret void } -; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2) +; CHECK: DIVERGENT: %result = call i32 
@llvm.amdgcn.permlane.down.i32(i32 %src0, i32 %src1, i32 %src2) define amdgpu_kernel void @v_permlane_down_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { %result= call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2) store i32 %result, ptr addrspace(1) %out ret void } -; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2) +; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.xor.i32(i32 %src0, i32 %src1, i32 %src2) define amdgpu_kernel void @v_permlane_xor_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { %result= call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2) store i32 %result, ptr addrspace(1) %out diff --git a/llvm/test/Assembler/thinlto-bad-summary1.ll b/llvm/test/Assembler/thinlto-bad-summary1.ll index 8ff5e06b189a6..900ad73cc8435 100644 --- a/llvm/test/Assembler/thinlto-bad-summary1.ll +++ b/llvm/test/Assembler/thinlto-bad-summary1.ll @@ -2,7 +2,7 @@ ; summary type label. ; RUN: not opt %s 2>&1 | FileCheck %s -; CHECK: error: Expected 'gv', 'module', 'typeid', 'flags' or 'blockcount' at the start of summary entry +; CHECK: error: Expected 'gv', 'module', 'typeid', 'typeidCompatibleVTable', 'flags' or 'blockcount' at the start of summary entry ; ModuleID = 'thinlto-function-summary-callgraph.ll' target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Assembler/thinlto-vtable-skip.ll b/llvm/test/Assembler/thinlto-vtable-skip.ll new file mode 100644 index 0000000000000..b7e71251bc44d --- /dev/null +++ b/llvm/test/Assembler/thinlto-vtable-skip.ll @@ -0,0 +1,11 @@ +; Disabling output means we'll just skip the summary entries, which is the code +; path we're trying to test. There's no output to check against, so we have no +; CHECKs. 
+; +; RUN: opt %s -disable-output + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +^0 = module: (path: "thinlto-vtable-skip.ll", hash: (0, 0, 0, 0, 0)) +^1 = typeidCompatibleVTable: (name: "_ZTS1A", summary: ((offset: 16, ^0))) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll index 09c2e481b0433..6eb7cf35c2a42 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -201,88 +201,88 @@ entry: define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) { ; CHECK-LABEL: abp90c12: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 ; CHECK-NEXT: ldr s17, [sp, #32] -; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 -; CHECK-NEXT: add x9, sp, #48 -; CHECK-NEXT: add x10, sp, #64 -; CHECK-NEXT: mov v1.s[1], v3.s[0] -; CHECK-NEXT: mov v0.s[1], v2.s[0] ; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 -; CHECK-NEXT: add x11, sp, #72 -; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: add x10, sp, #48 +; CHECK-NEXT: add x11, sp, #64 +; CHECK-NEXT: mov v0.s[1], v2.s[0] +; CHECK-NEXT: mov v1.s[1], v3.s[0] +; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 +; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6 +; CHECK-NEXT: ldr s3, [sp, #96] +; CHECK-NEXT: ld1 { v17.s }[1], [x10] +; CHECK-NEXT: add x10, sp, #72 +; CHECK-NEXT: ldr s16, [sp, #8] ; CHECK-NEXT: ldr s18, [x10] -; CHECK-NEXT: add x9, sp, #80 ; CHECK-NEXT: add x10, sp, #56 -; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6 ; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 
-; CHECK-NEXT: ldr s16, [sp, #8] -; CHECK-NEXT: ldr s3, [sp, #96] -; CHECK-NEXT: ld1 { v18.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #88 ; CHECK-NEXT: ldr s2, [sp] +; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: mov v0.s[2], v4.s[0] +; CHECK-NEXT: ldr s4, [x11] ; CHECK-NEXT: mov v1.s[2], v5.s[0] +; CHECK-NEXT: add x11, sp, #80 ; CHECK-NEXT: ldr s5, [sp, #40] -; CHECK-NEXT: mov v0.s[2], v4.s[0] +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; CHECK-NEXT: ld1 { v4.s }[1], [x11] +; CHECK-NEXT: add x11, sp, #88 +; CHECK-NEXT: add x9, sp, #168 ; CHECK-NEXT: ld1 { v5.s }[1], [x10] -; CHECK-NEXT: ldr s19, [x11] -; CHECK-NEXT: add x10, sp, #144 -; CHECK-NEXT: zip1 v4.2d, v17.2d, v18.2d -; CHECK-NEXT: add x11, sp, #160 -; CHECK-NEXT: ldr s18, [sp, #136] -; CHECK-NEXT: ld1 { v19.s }[1], [x9] +; CHECK-NEXT: ld1 { v18.s }[1], [x11] +; CHECK-NEXT: add x11, sp, #112 ; CHECK-NEXT: mov v0.s[3], v6.s[0] -; CHECK-NEXT: ldr s6, [sp, #128] ; CHECK-NEXT: mov v1.s[3], v7.s[0] -; CHECK-NEXT: add x9, sp, #24 -; CHECK-NEXT: ldr s7, [sp, #104] -; CHECK-NEXT: ld1 { v16.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #112 -; CHECK-NEXT: ld1 { v6.s }[1], [x10] -; CHECK-NEXT: zip1 v5.2d, v5.2d, v19.2d -; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-NEXT: add x10, sp, #24 +; CHECK-NEXT: ld1 { v3.s }[1], [x11] +; CHECK-NEXT: zip1 v4.2d, v17.2d, v4.2d +; CHECK-NEXT: add x11, sp, #120 +; CHECK-NEXT: zip1 v6.2d, v5.2d, v18.2d +; CHECK-NEXT: ldr s5, [sp, #104] +; CHECK-NEXT: ld1 { v16.s }[1], [x10] +; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: ldr s7, [sp, #128] +; CHECK-NEXT: ldr s18, [sp, #192] +; CHECK-NEXT: ld1 { v5.s }[1], [x11] +; CHECK-NEXT: ldr s17, [x10] +; CHECK-NEXT: add x10, sp, #144 +; CHECK-NEXT: add x11, sp, #176 ; CHECK-NEXT: ld1 { v7.s }[1], [x10] -; CHECK-NEXT: ldr s17, [x11] -; CHECK-NEXT: add x9, sp, #176 -; CHECK-NEXT: add x10, sp, #16 -; CHECK-NEXT: add x11, sp, #168 -; CHECK-NEXT: ld1 { v17.s }[1], [x9] -; CHECK-NEXT: ld1 { v2.s }[1], [x10] -; CHECK-NEXT: add x9, 
sp, #152 -; CHECK-NEXT: fmul v19.4s, v5.4s, v1.4s -; CHECK-NEXT: fmul v20.4s, v7.4s, v16.4s -; CHECK-NEXT: fmul v16.4s, v3.4s, v16.4s +; CHECK-NEXT: ldr s21, [x9] +; CHECK-NEXT: ld1 { v17.s }[1], [x11] +; CHECK-NEXT: fmul v19.4s, v6.4s, v1.4s ; CHECK-NEXT: fmul v1.4s, v4.4s, v1.4s +; CHECK-NEXT: fmul v20.4s, v5.4s, v16.4s +; CHECK-NEXT: fmul v16.4s, v3.4s, v16.4s +; CHECK-NEXT: add x9, sp, #208 +; CHECK-NEXT: add x10, sp, #152 +; CHECK-NEXT: add x11, sp, #184 ; CHECK-NEXT: ld1 { v18.s }[1], [x9] -; CHECK-NEXT: ldr s21, [x11] -; CHECK-NEXT: zip1 v6.2d, v6.2d, v17.2d -; CHECK-NEXT: ldr s17, [sp, #192] -; CHECK-NEXT: add x9, sp, #184 -; CHECK-NEXT: add x10, sp, #208 -; CHECK-NEXT: ld1 { v21.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #216 +; CHECK-NEXT: zip1 v7.2d, v7.2d, v17.2d +; CHECK-NEXT: ldr s17, [sp, #136] +; CHECK-NEXT: ld1 { v21.s }[1], [x11] ; CHECK-NEXT: fneg v19.4s, v19.4s +; CHECK-NEXT: fmla v1.4s, v0.4s, v6.4s +; CHECK-NEXT: add x9, sp, #216 ; CHECK-NEXT: fneg v20.4s, v20.4s -; CHECK-NEXT: fmla v16.4s, v2.4s, v7.4s -; CHECK-NEXT: fmla v1.4s, v0.4s, v5.4s +; CHECK-NEXT: fmla v16.4s, v2.4s, v5.4s ; CHECK-NEXT: ld1 { v17.s }[1], [x10] ; CHECK-NEXT: ldr s5, [sp, #200] -; CHECK-NEXT: zip1 v7.2d, v18.2d, v21.2d -; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: zip1 v6.2d, v17.2d, v21.2d ; CHECK-NEXT: fmla v19.4s, v0.4s, v4.4s +; CHECK-NEXT: fsub v0.4s, v7.4s, v1.4s ; CHECK-NEXT: fmla v20.4s, v2.4s, v3.4s -; CHECK-NEXT: fsub v0.4s, v6.4s, v1.4s -; CHECK-NEXT: fsub v1.4s, v17.4s, v16.4s -; CHECK-NEXT: fadd v2.4s, v7.4s, v19.4s +; CHECK-NEXT: fsub v1.4s, v18.4s, v16.4s +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: fadd v2.4s, v6.4s, v19.4s ; CHECK-NEXT: fadd v3.4s, v5.4s, v20.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v1.16b, #12 ; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #12 -; CHECK-NEXT: trn2 v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v4.16b, #12 +; CHECK-NEXT: trn2 v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8 ; CHECK-NEXT: rev64 
v4.4s, v4.4s ; CHECK-NEXT: trn2 v3.4s, v4.4s, v5.4s diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index 385ec6710185b..884f786d1b973 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -180,14 +180,23 @@ define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) { } define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %ptrD) { -; CHECK-LABEL: concat_v16s8_v4s8_load: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ld1 { v0.s }[1], [x1] -; CHECK-NEXT: ldr s1, [x2] -; CHECK-NEXT: ld1 { v1.s }[1], [x3] -; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: concat_v16s8_v4s8_load: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr s0, [x2] +; CHECK-SD-NEXT: ldr s1, [x0] +; CHECK-SD-NEXT: ld1 { v0.s }[1], [x3] +; CHECK-SD-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-SD-NEXT: zip1 v0.2d, v1.2d, v0.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: concat_v16s8_v4s8_load: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr s0, [x0] +; CHECK-GI-NEXT: ldr s1, [x2] +; CHECK-GI-NEXT: ld1 { v0.s }[1], [x1] +; CHECK-GI-NEXT: ld1 { v1.s }[1], [x3] +; CHECK-GI-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: ret %A = load <4 x i8>, ptr %ptrA %B = load <4 x i8>, ptr %ptrB %C = load <4 x i8>, ptr %ptrC diff --git a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll index a3a09839c54c4..a457fe01d309d 100644 --- a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll +++ b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll @@ -1683,42 +1683,42 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) { ; CHECK-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 ; CHECK-FP16-NEXT: // kill: def $h1 killed $h1 def $q1 ; CHECK-FP16-NEXT: // kill: def $h2 killed $h2 def $q2 -; CHECK-FP16-NEXT: add x9, sp, #16 ; CHECK-FP16-NEXT: // kill: def $h3 killed $h3 def $q3 ; CHECK-FP16-NEXT: // kill: def 
$h4 killed $h4 def $q4 -; CHECK-FP16-NEXT: add x10, sp, #40 +; CHECK-FP16-NEXT: add x9, sp, #40 +; CHECK-FP16-NEXT: add x10, sp, #48 ; CHECK-FP16-NEXT: // kill: def $h5 killed $h5 def $q5 ; CHECK-FP16-NEXT: // kill: def $h6 killed $h6 def $q6 ; CHECK-FP16-NEXT: // kill: def $h7 killed $h7 def $q7 ; CHECK-FP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-FP16-NEXT: ldr h1, [sp, #8] +; CHECK-FP16-NEXT: mov v0.h[2], v2.h[0] +; CHECK-FP16-NEXT: ldr h2, [x9] +; CHECK-FP16-NEXT: add x9, sp, #16 ; CHECK-FP16-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-FP16-NEXT: add x9, sp, #24 -; CHECK-FP16-NEXT: mov v0.h[2], v2.h[0] +; CHECK-FP16-NEXT: ld1 { v2.h }[1], [x10] +; CHECK-FP16-NEXT: add x10, sp, #56 +; CHECK-FP16-NEXT: mov v0.h[3], v3.h[0] ; CHECK-FP16-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-FP16-NEXT: add x9, sp, #32 -; CHECK-FP16-NEXT: mov v0.h[3], v3.h[0] -; CHECK-FP16-NEXT: ld1 { v1.h }[3], [x9] -; CHECK-FP16-NEXT: ldr h2, [x10] -; CHECK-FP16-NEXT: add x9, sp, #48 +; CHECK-FP16-NEXT: ld1 { v2.h }[2], [x10] +; CHECK-FP16-NEXT: add x10, sp, #64 ; CHECK-FP16-NEXT: ldr h3, [sp, #72] -; CHECK-FP16-NEXT: ld1 { v2.h }[1], [x9] -; CHECK-FP16-NEXT: add x9, sp, #56 +; CHECK-FP16-NEXT: ld1 { v1.h }[3], [x9] ; CHECK-FP16-NEXT: fminnm v3.8h, v3.8h, v3.8h ; CHECK-FP16-NEXT: mov v0.h[4], v4.h[0] -; CHECK-FP16-NEXT: ld1 { v2.h }[2], [x9] -; CHECK-FP16-NEXT: add x9, sp, #64 -; CHECK-FP16-NEXT: mov v0.h[5], v5.h[0] -; CHECK-FP16-NEXT: ld1 { v2.h }[3], [x9] +; CHECK-FP16-NEXT: ld1 { v2.h }[3], [x10] ; CHECK-FP16-NEXT: zip1 v1.2d, v1.2d, v2.2d ; CHECK-FP16-NEXT: ldr h2, [sp] -; CHECK-FP16-NEXT: mov v0.h[6], v6.h[0] +; CHECK-FP16-NEXT: mov v0.h[5], v5.h[0] ; CHECK-FP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; CHECK-FP16-NEXT: fminnm v1.8h, v1.8h, v1.8h -; CHECK-FP16-NEXT: mov v0.h[7], v7.h[0] +; CHECK-FP16-NEXT: mov v0.h[6], v6.h[0] ; CHECK-FP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h -; CHECK-FP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; CHECK-FP16-NEXT: mov v0.h[7], v7.h[0] ; CHECK-FP16-NEXT: str h2, [x8, #16] +; 
CHECK-FP16-NEXT: fminnm v0.8h, v0.8h, v0.8h ; CHECK-FP16-NEXT: fmaxnm v0.8h, v0.8h, v1.8h ; CHECK-FP16-NEXT: str q0, [x8] ; CHECK-FP16-NEXT: ret @@ -2326,42 +2326,42 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) { ; CHECK-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 ; CHECK-FP16-NEXT: // kill: def $h1 killed $h1 def $q1 ; CHECK-FP16-NEXT: // kill: def $h2 killed $h2 def $q2 -; CHECK-FP16-NEXT: add x9, sp, #16 ; CHECK-FP16-NEXT: // kill: def $h3 killed $h3 def $q3 ; CHECK-FP16-NEXT: // kill: def $h4 killed $h4 def $q4 -; CHECK-FP16-NEXT: add x10, sp, #40 +; CHECK-FP16-NEXT: add x9, sp, #40 +; CHECK-FP16-NEXT: add x10, sp, #48 ; CHECK-FP16-NEXT: // kill: def $h5 killed $h5 def $q5 ; CHECK-FP16-NEXT: // kill: def $h6 killed $h6 def $q6 ; CHECK-FP16-NEXT: // kill: def $h7 killed $h7 def $q7 ; CHECK-FP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-FP16-NEXT: ldr h1, [sp, #8] +; CHECK-FP16-NEXT: mov v0.h[2], v2.h[0] +; CHECK-FP16-NEXT: ldr h2, [x9] +; CHECK-FP16-NEXT: add x9, sp, #16 ; CHECK-FP16-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-FP16-NEXT: add x9, sp, #24 -; CHECK-FP16-NEXT: mov v0.h[2], v2.h[0] +; CHECK-FP16-NEXT: ld1 { v2.h }[1], [x10] +; CHECK-FP16-NEXT: add x10, sp, #56 +; CHECK-FP16-NEXT: mov v0.h[3], v3.h[0] ; CHECK-FP16-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-FP16-NEXT: add x9, sp, #32 -; CHECK-FP16-NEXT: mov v0.h[3], v3.h[0] -; CHECK-FP16-NEXT: ld1 { v1.h }[3], [x9] -; CHECK-FP16-NEXT: ldr h2, [x10] -; CHECK-FP16-NEXT: add x9, sp, #48 +; CHECK-FP16-NEXT: ld1 { v2.h }[2], [x10] +; CHECK-FP16-NEXT: add x10, sp, #64 ; CHECK-FP16-NEXT: ldr h3, [sp, #72] -; CHECK-FP16-NEXT: ld1 { v2.h }[1], [x9] -; CHECK-FP16-NEXT: add x9, sp, #56 +; CHECK-FP16-NEXT: ld1 { v1.h }[3], [x9] ; CHECK-FP16-NEXT: fminnm v3.8h, v3.8h, v3.8h ; CHECK-FP16-NEXT: mov v0.h[4], v4.h[0] -; CHECK-FP16-NEXT: ld1 { v2.h }[2], [x9] -; CHECK-FP16-NEXT: add x9, sp, #64 -; CHECK-FP16-NEXT: mov v0.h[5], v5.h[0] -; CHECK-FP16-NEXT: ld1 { v2.h }[3], [x9] +; CHECK-FP16-NEXT: ld1 { v2.h }[3], [x10] ; 
CHECK-FP16-NEXT: zip1 v1.2d, v1.2d, v2.2d ; CHECK-FP16-NEXT: ldr h2, [sp] -; CHECK-FP16-NEXT: mov v0.h[6], v6.h[0] +; CHECK-FP16-NEXT: mov v0.h[5], v5.h[0] ; CHECK-FP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; CHECK-FP16-NEXT: fminnm v1.8h, v1.8h, v1.8h -; CHECK-FP16-NEXT: mov v0.h[7], v7.h[0] +; CHECK-FP16-NEXT: mov v0.h[6], v6.h[0] ; CHECK-FP16-NEXT: fminnm v2.8h, v2.8h, v3.8h -; CHECK-FP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; CHECK-FP16-NEXT: mov v0.h[7], v7.h[0] ; CHECK-FP16-NEXT: str h2, [x8, #16] +; CHECK-FP16-NEXT: fminnm v0.8h, v0.8h, v0.8h ; CHECK-FP16-NEXT: fminnm v0.8h, v0.8h, v1.8h ; CHECK-FP16-NEXT: str q0, [x8] ; CHECK-FP16-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index 89109ba653f57..6a36a0f3fe5fd 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -2396,64 +2396,64 @@ define <7 x i32> @fshl_v7i32(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c) { ; ; CHECK-GI-LABEL: fshl_v7i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr s5, [sp, #80] -; CHECK-GI-NEXT: ldr s16, [sp, #88] -; CHECK-GI-NEXT: fmov s19, w0 ; CHECK-GI-NEXT: ldr s1, [sp, #48] ; CHECK-GI-NEXT: ldr s7, [sp, #56] +; CHECK-GI-NEXT: add x9, sp, #64 +; CHECK-GI-NEXT: ldr s5, [sp, #80] +; CHECK-GI-NEXT: ldr s16, [sp, #88] ; CHECK-GI-NEXT: add x8, sp, #56 +; CHECK-GI-NEXT: mov v1.s[1], v7.s[0] +; CHECK-GI-NEXT: ldr s6, [sp] +; CHECK-GI-NEXT: ldr s7, [sp, #64] ; CHECK-GI-NEXT: mov v5.s[1], v16.s[0] ; CHECK-GI-NEXT: fmov s16, w7 -; CHECK-GI-NEXT: ldr s6, [sp] -; CHECK-GI-NEXT: mov v1.s[1], v7.s[0] -; CHECK-GI-NEXT: ldr s17, [sp, #64] -; CHECK-GI-NEXT: ldr s7, [sp, #48] +; CHECK-GI-NEXT: ldr s17, [sp, #48] +; CHECK-GI-NEXT: ldr s18, [x9] +; CHECK-GI-NEXT: add x10, sp, #72 +; CHECK-GI-NEXT: ldr s19, [sp, #96] +; CHECK-GI-NEXT: ld1 { v17.s }[1], [x8] ; CHECK-GI-NEXT: ldr s4, [sp, #8] ; CHECK-GI-NEXT: ldr s0, [sp, #24] -; CHECK-GI-NEXT: ldr s3, [sp, #32] ; CHECK-GI-NEXT: mov v16.s[1], v6.s[0] -; CHECK-GI-NEXT: ldr s6, [sp, 
#96] -; CHECK-GI-NEXT: add x9, sp, #64 -; CHECK-GI-NEXT: ld1 { v7.s }[1], [x8] -; CHECK-GI-NEXT: ldr s18, [x9] -; CHECK-GI-NEXT: mov v19.s[1], w1 -; CHECK-GI-NEXT: mov v5.s[2], v6.s[0] -; CHECK-GI-NEXT: movi v6.2d, #0xffffffffffffffff -; CHECK-GI-NEXT: mov v1.s[2], v17.s[0] -; CHECK-GI-NEXT: ldr s17, [sp, #72] -; CHECK-GI-NEXT: add x8, sp, #72 +; CHECK-GI-NEXT: mov v1.s[2], v7.s[0] +; CHECK-GI-NEXT: fmov s7, w0 +; CHECK-GI-NEXT: ld1 { v18.s }[1], [x10] +; CHECK-GI-NEXT: ldr s3, [sp, #32] +; CHECK-GI-NEXT: ldr s6, [sp, #72] +; CHECK-GI-NEXT: mov v5.s[2], v19.s[0] +; CHECK-GI-NEXT: movi v19.2d, #0xffffffffffffffff ; CHECK-GI-NEXT: ldr s20, [sp, #80] -; CHECK-GI-NEXT: mov v16.s[2], v4.s[0] +; CHECK-GI-NEXT: mov v7.s[1], w1 ; CHECK-GI-NEXT: mov v0.s[1], v3.s[0] -; CHECK-GI-NEXT: ld1 { v18.s }[1], [x8] ; CHECK-GI-NEXT: add x8, sp, #88 +; CHECK-GI-NEXT: mov v16.s[2], v4.s[0] +; CHECK-GI-NEXT: mov v1.s[3], v6.s[0] +; CHECK-GI-NEXT: zip1 v6.2d, v17.2d, v18.2d +; CHECK-GI-NEXT: fmov s17, w4 ; CHECK-GI-NEXT: movi v4.4s, #31 ; CHECK-GI-NEXT: ldr s2, [sp, #16] -; CHECK-GI-NEXT: eor v5.16b, v5.16b, v6.16b -; CHECK-GI-NEXT: fmov s6, w4 -; CHECK-GI-NEXT: mov v1.s[3], v17.s[0] ; CHECK-GI-NEXT: ldr s3, [sp, #40] ; CHECK-GI-NEXT: ld1 { v20.s }[1], [x8] -; CHECK-GI-NEXT: mov v19.s[2], w2 -; CHECK-GI-NEXT: zip1 v7.2d, v7.2d, v18.2d -; CHECK-GI-NEXT: mov v16.s[3], v2.s[0] +; CHECK-GI-NEXT: eor v5.16b, v5.16b, v19.16b +; CHECK-GI-NEXT: mov v7.s[2], w2 ; CHECK-GI-NEXT: add x8, sp, #96 -; CHECK-GI-NEXT: mov v6.s[1], w5 +; CHECK-GI-NEXT: mov v17.s[1], w5 +; CHECK-GI-NEXT: mov v16.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v0.s[2], v3.s[0] -; CHECK-GI-NEXT: and v2.16b, v5.16b, v4.16b ; CHECK-GI-NEXT: bic v1.16b, v4.16b, v1.16b ; CHECK-GI-NEXT: ld1 { v20.s }[2], [x8] -; CHECK-GI-NEXT: mov v19.s[3], w3 -; CHECK-GI-NEXT: and v3.16b, v7.16b, v4.16b +; CHECK-GI-NEXT: and v2.16b, v5.16b, v4.16b +; CHECK-GI-NEXT: and v3.16b, v6.16b, v4.16b +; CHECK-GI-NEXT: mov v7.s[3], w3 +; CHECK-GI-NEXT: mov v17.s[2], 
w6 ; CHECK-GI-NEXT: ushr v5.4s, v16.4s, #1 -; CHECK-GI-NEXT: neg v2.4s, v2.4s -; CHECK-GI-NEXT: mov v6.s[2], w6 ; CHECK-GI-NEXT: neg v1.4s, v1.4s ; CHECK-GI-NEXT: and v4.16b, v20.16b, v4.16b ; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #1 -; CHECK-GI-NEXT: ushl v3.4s, v19.4s, v3.4s +; CHECK-GI-NEXT: neg v2.4s, v2.4s +; CHECK-GI-NEXT: ushl v3.4s, v7.4s, v3.4s ; CHECK-GI-NEXT: ushl v1.4s, v5.4s, v1.4s -; CHECK-GI-NEXT: ushl v4.4s, v6.4s, v4.4s +; CHECK-GI-NEXT: ushl v4.4s, v17.4s, v4.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v2.4s ; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b ; CHECK-GI-NEXT: orr v0.16b, v4.16b, v0.16b @@ -2536,62 +2536,62 @@ define <7 x i32> @fshr_v7i32(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c) { ; CHECK-GI-LABEL: fshr_v7i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr s1, [sp, #48] -; CHECK-GI-NEXT: ldr s16, [sp, #56] -; CHECK-GI-NEXT: add x8, sp, #56 -; CHECK-GI-NEXT: ldr s17, [sp, #80] -; CHECK-GI-NEXT: ldr s18, [sp, #88] +; CHECK-GI-NEXT: ldr s17, [sp, #56] ; CHECK-GI-NEXT: add x9, sp, #64 -; CHECK-GI-NEXT: mov v1.s[1], v16.s[0] -; CHECK-GI-NEXT: fmov s16, w0 -; CHECK-GI-NEXT: ldr s5, [sp, #48] -; CHECK-GI-NEXT: mov v17.s[1], v18.s[0] -; CHECK-GI-NEXT: fmov s18, w7 +; CHECK-GI-NEXT: ldr s18, [sp, #80] +; CHECK-GI-NEXT: ldr s19, [sp, #88] +; CHECK-GI-NEXT: add x8, sp, #56 +; CHECK-GI-NEXT: mov v1.s[1], v17.s[0] +; CHECK-GI-NEXT: fmov s17, w0 ; CHECK-GI-NEXT: ldr s4, [sp] -; CHECK-GI-NEXT: ld1 { v5.s }[1], [x8] -; CHECK-GI-NEXT: ldr s6, [x9] +; CHECK-GI-NEXT: mov v18.s[1], v19.s[0] +; CHECK-GI-NEXT: fmov s19, w7 +; CHECK-GI-NEXT: ldr s7, [sp, #48] +; CHECK-GI-NEXT: ldr s16, [x9] ; CHECK-GI-NEXT: add x10, sp, #72 -; CHECK-GI-NEXT: mov v16.s[1], w1 ; CHECK-GI-NEXT: ldr s20, [sp, #80] -; CHECK-GI-NEXT: ldr s7, [sp, #64] -; CHECK-GI-NEXT: mov v18.s[1], v4.s[0] +; CHECK-GI-NEXT: mov v17.s[1], w1 +; CHECK-GI-NEXT: ldr s6, [sp, #64] +; CHECK-GI-NEXT: add x9, sp, #88 +; CHECK-GI-NEXT: mov v19.s[1], v4.s[0] ; CHECK-GI-NEXT: fmov s4, w4 -; CHECK-GI-NEXT: add x8, 
sp, #88 -; CHECK-GI-NEXT: ld1 { v6.s }[1], [x10] +; CHECK-GI-NEXT: ld1 { v7.s }[1], [x8] +; CHECK-GI-NEXT: ld1 { v16.s }[1], [x10] ; CHECK-GI-NEXT: ldr s21, [sp, #96] -; CHECK-GI-NEXT: ld1 { v20.s }[1], [x8] -; CHECK-GI-NEXT: mov v1.s[2], v7.s[0] +; CHECK-GI-NEXT: ld1 { v20.s }[1], [x9] +; CHECK-GI-NEXT: mov v1.s[2], v6.s[0] ; CHECK-GI-NEXT: ldr s3, [sp, #8] ; CHECK-GI-NEXT: ldr s0, [sp, #24] -; CHECK-GI-NEXT: mov v16.s[2], w2 +; CHECK-GI-NEXT: mov v17.s[2], w2 ; CHECK-GI-NEXT: mov v4.s[1], w5 ; CHECK-GI-NEXT: ldr s2, [sp, #32] -; CHECK-GI-NEXT: zip1 v5.2d, v5.2d, v6.2d -; CHECK-GI-NEXT: movi v6.4s, #31 +; CHECK-GI-NEXT: zip1 v6.2d, v7.2d, v16.2d +; CHECK-GI-NEXT: movi v7.4s, #31 ; CHECK-GI-NEXT: add x8, sp, #96 -; CHECK-GI-NEXT: mov v17.s[2], v21.s[0] -; CHECK-GI-NEXT: movi v7.2d, #0xffffffffffffffff -; CHECK-GI-NEXT: ldr s19, [sp, #72] +; CHECK-GI-NEXT: mov v18.s[2], v21.s[0] +; CHECK-GI-NEXT: movi v16.2d, #0xffffffffffffffff +; CHECK-GI-NEXT: ldr s5, [sp, #72] ; CHECK-GI-NEXT: ld1 { v20.s }[2], [x8] -; CHECK-GI-NEXT: mov v18.s[2], v3.s[0] +; CHECK-GI-NEXT: mov v19.s[2], v3.s[0] ; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: mov v1.s[3], v19.s[0] -; CHECK-GI-NEXT: mov v16.s[3], w3 +; CHECK-GI-NEXT: mov v1.s[3], v5.s[0] +; CHECK-GI-NEXT: mov v17.s[3], w3 ; CHECK-GI-NEXT: mov v4.s[2], w6 ; CHECK-GI-NEXT: ldr s2, [sp, #16] -; CHECK-GI-NEXT: and v3.16b, v5.16b, v6.16b +; CHECK-GI-NEXT: and v3.16b, v6.16b, v7.16b ; CHECK-GI-NEXT: ldr s5, [sp, #40] -; CHECK-GI-NEXT: and v19.16b, v20.16b, v6.16b -; CHECK-GI-NEXT: eor v7.16b, v17.16b, v7.16b -; CHECK-GI-NEXT: mov v18.s[3], v2.s[0] +; CHECK-GI-NEXT: and v6.16b, v20.16b, v7.16b +; CHECK-GI-NEXT: eor v16.16b, v18.16b, v16.16b +; CHECK-GI-NEXT: mov v19.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v0.s[2], v5.s[0] -; CHECK-GI-NEXT: bic v1.16b, v6.16b, v1.16b -; CHECK-GI-NEXT: shl v2.4s, v16.4s, #1 +; CHECK-GI-NEXT: bic v1.16b, v7.16b, v1.16b +; CHECK-GI-NEXT: shl v2.4s, v17.4s, #1 ; CHECK-GI-NEXT: neg v3.4s, v3.4s -; 
CHECK-GI-NEXT: and v5.16b, v7.16b, v6.16b +; CHECK-GI-NEXT: and v5.16b, v16.16b, v7.16b ; CHECK-GI-NEXT: shl v4.4s, v4.4s, #1 -; CHECK-GI-NEXT: neg v6.4s, v19.4s +; CHECK-GI-NEXT: neg v6.4s, v6.4s ; CHECK-GI-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-GI-NEXT: ushl v2.4s, v18.4s, v3.4s +; CHECK-GI-NEXT: ushl v2.4s, v19.4s, v3.4s ; CHECK-GI-NEXT: ushl v3.4s, v4.4s, v5.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v6.4s ; CHECK-GI-NEXT: orr v1.16b, v1.16b, v2.16b diff --git a/llvm/test/CodeGen/AArch64/llvm.frexp.ll b/llvm/test/CodeGen/AArch64/llvm.frexp.ll index 7fd4246cd4975..12534a1c0114a 100644 --- a/llvm/test/CodeGen/AArch64/llvm.frexp.ll +++ b/llvm/test/CodeGen/AArch64/llvm.frexp.ll @@ -697,17 +697,17 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi ; CHECK-NEXT: mov s0, v0.s[3] ; CHECK-NEXT: str q1, [sp] // 16-byte Spill ; CHECK-NEXT: bl frexpf +; CHECK-NEXT: ldr q3, [sp] // 16-byte Reload ; CHECK-NEXT: ldr s1, [sp, #44] -; CHECK-NEXT: ldr q2, [sp] // 16-byte Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: mov v2.s[3], v0.s[0] +; CHECK-NEXT: ldr s2, [x20] +; CHECK-NEXT: mov v3.s[3], v0.s[0] ; CHECK-NEXT: ld1 { v1.s }[1], [x19] -; CHECK-NEXT: ldr s0, [x20] -; CHECK-NEXT: ld1 { v0.s }[1], [x21] +; CHECK-NEXT: ld1 { v2.s }[1], [x21] ; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp x30, x21, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: zip1 v1.2d, v1.2d, v0.2d -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: zip1 v1.2d, v1.2d, v2.2d +; CHECK-NEXT: mov v0.16b, v3.16b ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret ; @@ -872,8 +872,8 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind { ; CHECK-NEXT: mov s0, v0.s[3] ; CHECK-NEXT: bl frexpf ; CHECK-NEXT: ldr s0, [sp, #28] -; CHECK-NEXT: ld1 { v0.s }[1], [x19] ; CHECK-NEXT: ldr s1, [x20] +; CHECK-NEXT: ld1 { v0.s }[1], [x19] ; CHECK-NEXT: ld1 { v1.s }[1], [x21] ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 
16-byte Folded Reload ; CHECK-NEXT: ldp x30, x21, [sp, #32] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index ff9c75cfd0c5e..e176cd3233d69 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -8048,200 +8048,200 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> % ; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: .cfi_offset w29, -16 -; CHECK-SD-NEXT: ldr b0, [sp, #208] +; CHECK-SD-NEXT: add x8, sp, #272 +; CHECK-SD-NEXT: ldr b4, [sp, #208] +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: ldr b5, [x8] ; CHECK-SD-NEXT: add x8, sp, #216 -; CHECK-SD-NEXT: add x9, sp, #272 -; CHECK-SD-NEXT: ldr b2, [sp, #80] -; CHECK-SD-NEXT: ldr b4, [sp, #976] -; CHECK-SD-NEXT: ldr b6, [sp, #720] -; CHECK-SD-NEXT: ld1 { v0.b }[1], [x8] +; CHECK-SD-NEXT: add x9, sp, #280 +; CHECK-SD-NEXT: ld1 { v4.b }[1], [x8] ; CHECK-SD-NEXT: add x8, sp, #224 -; CHECK-SD-NEXT: fmov s16, w0 -; CHECK-SD-NEXT: ldr b17, [sp, #848] -; CHECK-SD-NEXT: add x10, sp, #24 -; CHECK-SD-NEXT: movi v19.2d, #0000000000000000 -; CHECK-SD-NEXT: ld1 { v0.b }[2], [x8] +; CHECK-SD-NEXT: add x12, sp, #256 +; CHECK-SD-NEXT: mov v0.b[1], w1 +; CHECK-SD-NEXT: ld1 { v5.b }[1], [x9] +; CHECK-SD-NEXT: add x9, sp, #288 +; CHECK-SD-NEXT: ldr b6, [sp, #976] +; CHECK-SD-NEXT: add x13, sp, #984 +; CHECK-SD-NEXT: add x10, sp, #264 +; CHECK-SD-NEXT: ld1 { v4.b }[2], [x8] ; CHECK-SD-NEXT: add x8, sp, #232 -; CHECK-SD-NEXT: mov v16.b[1], w1 -; CHECK-SD-NEXT: ld1 { v0.b }[3], [x8] +; CHECK-SD-NEXT: ldr b7, [sp, #720] +; CHECK-SD-NEXT: ld1 { v5.b }[2], [x9] +; CHECK-SD-NEXT: add x9, sp, #296 +; CHECK-SD-NEXT: ld1 { v6.b }[1], [x13] +; CHECK-SD-NEXT: mov v0.b[2], w2 +; CHECK-SD-NEXT: add x13, sp, #784 +; CHECK-SD-NEXT: add x11, sp, #328 +; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, 
#240 -; CHECK-SD-NEXT: mov v16.b[2], w2 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #248 -; CHECK-SD-NEXT: mov v16.b[3], w3 -; CHECK-SD-NEXT: ld1 { v0.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #256 -; CHECK-SD-NEXT: ld1 { v0.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #264 -; CHECK-SD-NEXT: mov v16.b[4], w4 -; CHECK-SD-NEXT: ld1 { v0.b }[7], [x8] -; CHECK-SD-NEXT: ldr b1, [x9] -; CHECK-SD-NEXT: add x8, sp, #280 -; CHECK-SD-NEXT: add x9, sp, #88 -; CHECK-SD-NEXT: mov v16.b[5], w5 -; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #288 -; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #296 -; CHECK-SD-NEXT: mov v16.b[6], w6 -; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #304 -; CHECK-SD-NEXT: mov v16.b[7], w7 -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #312 -; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #320 -; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #328 -; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8] -; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-SD-NEXT: add x8, sp, #96 -; CHECK-SD-NEXT: add x9, sp, #144 -; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #104 -; CHECK-SD-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: ldr b17, [x13] +; CHECK-SD-NEXT: ld1 { v5.b }[3], [x9] +; CHECK-SD-NEXT: add x9, sp, #304 +; CHECK-SD-NEXT: add x13, sp, #1064 +; CHECK-SD-NEXT: add x14, sp, #1080 ; CHECK-SD-NEXT: movi v1.16b, #1 -; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #112 -; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #120 -; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #128 -; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #136 -; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8] -; CHECK-SD-NEXT: ldr b3, [x9] -; CHECK-SD-NEXT: add x8, sp, #152 -; CHECK-SD-NEXT: add x9, sp, #984 -; CHECK-SD-NEXT: ld1 { v3.b }[1], [x8] -; 
CHECK-SD-NEXT: add x8, sp, #160 -; CHECK-SD-NEXT: ld1 { v3.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #168 -; CHECK-SD-NEXT: ld1 { v3.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #176 -; CHECK-SD-NEXT: ld1 { v3.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #184 -; CHECK-SD-NEXT: ld1 { v3.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #192 -; CHECK-SD-NEXT: ld1 { v3.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #200 -; CHECK-SD-NEXT: ld1 { v3.b }[7], [x8] -; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9] -; CHECK-SD-NEXT: add x8, sp, #992 -; CHECK-SD-NEXT: add x9, sp, #1040 -; CHECK-SD-NEXT: ld1 { v4.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #1000 -; CHECK-SD-NEXT: zip1 v2.2d, v2.2d, v3.2d -; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #1008 +; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 ; CHECK-SD-NEXT: ld1 { v4.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #1016 +; CHECK-SD-NEXT: add x8, sp, #248 +; CHECK-SD-NEXT: mov v0.b[3], w3 +; CHECK-SD-NEXT: ld1 { v5.b }[4], [x9] +; CHECK-SD-NEXT: add x9, sp, #312 +; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 ; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #1024 -; CHECK-SD-NEXT: ld1 { v4.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #1032 -; CHECK-SD-NEXT: ld1 { v4.b }[7], [x8] -; CHECK-SD-NEXT: ldr b5, [x9] -; CHECK-SD-NEXT: add x8, sp, #1048 -; CHECK-SD-NEXT: add x9, sp, #728 -; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #1056 -; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #1064 -; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #1072 -; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #1080 -; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #1088 +; CHECK-SD-NEXT: add x8, sp, #320 +; CHECK-SD-NEXT: mov v0.b[4], w4 +; CHECK-SD-NEXT: ld1 { v5.b }[5], [x9] +; CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: ld1 { v4.b }[6], [x12] +; CHECK-SD-NEXT: add x12, sp, #1040 +; CHECK-SD-NEXT: ldr b16, [x12] +; 
CHECK-SD-NEXT: add x12, sp, #1048 ; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #1096 -; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8] -; CHECK-SD-NEXT: ld1 { v6.b }[1], [x9] -; CHECK-SD-NEXT: add x8, sp, #736 -; CHECK-SD-NEXT: add x9, sp, #784 -; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #744 -; CHECK-SD-NEXT: zip1 v4.2d, v4.2d, v5.2d -; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 -; CHECK-SD-NEXT: ld1 { v6.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #752 -; CHECK-SD-NEXT: sdot v19.4s, v4.16b, v1.16b -; CHECK-SD-NEXT: sdot v5.4s, v0.16b, v1.16b -; CHECK-SD-NEXT: ld1 { v6.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #760 -; CHECK-SD-NEXT: ld1 { v6.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #768 -; CHECK-SD-NEXT: ld1 { v6.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #776 +; CHECK-SD-NEXT: mov v0.b[5], w5 +; CHECK-SD-NEXT: add x8, sp, #24 +; CHECK-SD-NEXT: ld1 { v16.b }[1], [x12] +; CHECK-SD-NEXT: ld1 { v4.b }[7], [x10] +; CHECK-SD-NEXT: add x10, sp, #992 +; CHECK-SD-NEXT: add x12, sp, #1056 +; CHECK-SD-NEXT: ld1 { v6.b }[2], [x10] +; CHECK-SD-NEXT: add x10, sp, #728 +; CHECK-SD-NEXT: ld1 { v7.b }[1], [x10] +; CHECK-SD-NEXT: ld1 { v5.b }[7], [x11] +; CHECK-SD-NEXT: add x11, sp, #1000 +; CHECK-SD-NEXT: ld1 { v16.b }[2], [x12] +; CHECK-SD-NEXT: add x12, sp, #792 +; CHECK-SD-NEXT: mov v0.b[6], w6 +; CHECK-SD-NEXT: ld1 { v17.b }[1], [x12] +; CHECK-SD-NEXT: ld1 { v6.b }[3], [x11] +; CHECK-SD-NEXT: add x12, sp, #736 +; CHECK-SD-NEXT: ld1 { v7.b }[2], [x12] +; CHECK-SD-NEXT: add x10, sp, #1008 +; CHECK-SD-NEXT: add x11, sp, #1072 +; CHECK-SD-NEXT: ld1 { v16.b }[3], [x13] +; CHECK-SD-NEXT: add x13, sp, #800 +; CHECK-SD-NEXT: zip1 v5.2d, v4.2d, v5.2d +; CHECK-SD-NEXT: ld1 { v17.b }[2], [x13] +; CHECK-SD-NEXT: ld1 { v6.b }[4], [x10] +; CHECK-SD-NEXT: add x13, sp, #808 +; CHECK-SD-NEXT: mov v0.b[7], w7 +; CHECK-SD-NEXT: add x10, sp, #1016 +; CHECK-SD-NEXT: add x12, sp, #32 +; CHECK-SD-NEXT: ld1 { v16.b }[4], [x11] +; CHECK-SD-NEXT: add x11, 
sp, #744 +; CHECK-SD-NEXT: ldr b4, [sp, #80] +; CHECK-SD-NEXT: ld1 { v7.b }[3], [x11] +; CHECK-SD-NEXT: ld1 { v17.b }[3], [x13] +; CHECK-SD-NEXT: ld1 { v6.b }[5], [x10] +; CHECK-SD-NEXT: add x10, sp, #752 +; CHECK-SD-NEXT: add x11, sp, #816 +; CHECK-SD-NEXT: add x13, sp, #1088 +; CHECK-SD-NEXT: ld1 { v16.b }[5], [x14] +; CHECK-SD-NEXT: ld1 { v0.b }[8], [x9] +; CHECK-SD-NEXT: add x9, sp, #1024 +; CHECK-SD-NEXT: ld1 { v7.b }[4], [x10] +; CHECK-SD-NEXT: ld1 { v17.b }[4], [x11] +; CHECK-SD-NEXT: ld1 { v6.b }[6], [x9] +; CHECK-SD-NEXT: add x9, sp, #760 +; CHECK-SD-NEXT: add x10, sp, #824 +; CHECK-SD-NEXT: add x11, sp, #1096 +; CHECK-SD-NEXT: ld1 { v16.b }[6], [x13] +; CHECK-SD-NEXT: ld1 { v0.b }[9], [x8] +; CHECK-SD-NEXT: add x8, sp, #1032 +; CHECK-SD-NEXT: ld1 { v7.b }[5], [x9] +; CHECK-SD-NEXT: ld1 { v17.b }[5], [x10] ; CHECK-SD-NEXT: ld1 { v6.b }[7], [x8] -; CHECK-SD-NEXT: ldr b7, [x9] -; CHECK-SD-NEXT: add x8, sp, #792 -; CHECK-SD-NEXT: add x9, sp, #856 -; CHECK-SD-NEXT: ld1 { v7.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #800 -; CHECK-SD-NEXT: ld1 { v7.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #808 -; CHECK-SD-NEXT: ld1 { v7.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #816 -; CHECK-SD-NEXT: ld1 { v7.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #824 -; CHECK-SD-NEXT: ld1 { v7.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #832 -; CHECK-SD-NEXT: ld1 { v7.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #840 -; CHECK-SD-NEXT: ld1 { v7.b }[7], [x8] -; CHECK-SD-NEXT: ld1 { v17.b }[1], [x9] -; CHECK-SD-NEXT: add x8, sp, #864 -; CHECK-SD-NEXT: add x9, sp, #16 -; CHECK-SD-NEXT: ld1 { v16.b }[8], [x9] -; CHECK-SD-NEXT: add x9, sp, #912 -; CHECK-SD-NEXT: ld1 { v17.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #872 -; CHECK-SD-NEXT: zip1 v0.2d, v6.2d, v7.2d -; CHECK-SD-NEXT: ld1 { v16.b }[9], [x10] -; CHECK-SD-NEXT: ld1 { v17.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #880 -; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b -; CHECK-SD-NEXT: ld1 { v17.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, 
#888 -; CHECK-SD-NEXT: ld1 { v17.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #896 -; CHECK-SD-NEXT: ld1 { v17.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #904 -; CHECK-SD-NEXT: ld1 { v17.b }[7], [x8] -; CHECK-SD-NEXT: ldr b18, [x9] -; CHECK-SD-NEXT: add x8, sp, #920 -; CHECK-SD-NEXT: ld1 { v18.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #32 -; CHECK-SD-NEXT: ld1 { v16.b }[10], [x8] -; CHECK-SD-NEXT: add x8, sp, #928 -; CHECK-SD-NEXT: ld1 { v18.b }[2], [x8] +; CHECK-SD-NEXT: add x9, sp, #768 +; CHECK-SD-NEXT: add x10, sp, #832 ; CHECK-SD-NEXT: add x8, sp, #40 -; CHECK-SD-NEXT: ld1 { v16.b }[11], [x8] -; CHECK-SD-NEXT: add x8, sp, #936 -; CHECK-SD-NEXT: ld1 { v18.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #48 -; CHECK-SD-NEXT: ld1 { v16.b }[12], [x8] -; CHECK-SD-NEXT: add x8, sp, #944 -; CHECK-SD-NEXT: ld1 { v18.b }[4], [x8] +; CHECK-SD-NEXT: ld1 { v16.b }[7], [x11] +; CHECK-SD-NEXT: ld1 { v0.b }[10], [x12] +; CHECK-SD-NEXT: add x12, sp, #144 +; CHECK-SD-NEXT: ld1 { v7.b }[6], [x9] +; CHECK-SD-NEXT: ld1 { v17.b }[6], [x10] +; CHECK-SD-NEXT: add x9, sp, #776 +; CHECK-SD-NEXT: add x10, sp, #840 +; CHECK-SD-NEXT: sdot v3.4s, v5.16b, v1.16b +; CHECK-SD-NEXT: ldr b5, [x12] +; CHECK-SD-NEXT: zip1 v6.2d, v6.2d, v16.2d +; CHECK-SD-NEXT: ld1 { v0.b }[11], [x8] +; CHECK-SD-NEXT: add x8, sp, #88 +; CHECK-SD-NEXT: ld1 { v7.b }[7], [x9] +; CHECK-SD-NEXT: ld1 { v17.b }[7], [x10] +; CHECK-SD-NEXT: add x10, sp, #912 +; CHECK-SD-NEXT: ldr b16, [x10] +; CHECK-SD-NEXT: add x9, sp, #152 +; CHECK-SD-NEXT: ld1 { v4.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #856 +; CHECK-SD-NEXT: add x10, sp, #920 +; CHECK-SD-NEXT: add x11, sp, #48 +; CHECK-SD-NEXT: sdot v2.4s, v6.16b, v1.16b +; CHECK-SD-NEXT: zip1 v6.2d, v7.2d, v17.2d +; CHECK-SD-NEXT: ldr b7, [sp, #848] +; CHECK-SD-NEXT: ld1 { v5.b }[1], [x9] +; CHECK-SD-NEXT: ld1 { v16.b }[1], [x10] +; CHECK-SD-NEXT: ld1 { v0.b }[12], [x11] +; CHECK-SD-NEXT: ld1 { v7.b }[1], [x8] +; CHECK-SD-NEXT: add x9, sp, #96 +; CHECK-SD-NEXT: add x10, sp, #160 +; 
CHECK-SD-NEXT: ld1 { v4.b }[2], [x9] +; CHECK-SD-NEXT: add x9, sp, #864 +; CHECK-SD-NEXT: add x11, sp, #928 ; CHECK-SD-NEXT: add x8, sp, #56 -; CHECK-SD-NEXT: ld1 { v16.b }[13], [x8] -; CHECK-SD-NEXT: add x8, sp, #952 -; CHECK-SD-NEXT: ld1 { v18.b }[5], [x8] +; CHECK-SD-NEXT: ld1 { v5.b }[2], [x10] +; CHECK-SD-NEXT: ld1 { v16.b }[2], [x11] +; CHECK-SD-NEXT: ld1 { v7.b }[2], [x9] +; CHECK-SD-NEXT: ld1 { v0.b }[13], [x8] +; CHECK-SD-NEXT: add x8, sp, #104 +; CHECK-SD-NEXT: add x9, sp, #168 +; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #872 +; CHECK-SD-NEXT: add x10, sp, #936 +; CHECK-SD-NEXT: ld1 { v5.b }[3], [x9] +; CHECK-SD-NEXT: add x9, sp, #112 +; CHECK-SD-NEXT: ld1 { v7.b }[3], [x8] +; CHECK-SD-NEXT: ld1 { v16.b }[3], [x10] +; CHECK-SD-NEXT: add x10, sp, #176 +; CHECK-SD-NEXT: ld1 { v4.b }[4], [x9] +; CHECK-SD-NEXT: add x9, sp, #880 +; CHECK-SD-NEXT: add x11, sp, #944 ; CHECK-SD-NEXT: add x8, sp, #64 -; CHECK-SD-NEXT: ld1 { v16.b }[14], [x8] -; CHECK-SD-NEXT: add x8, sp, #960 -; CHECK-SD-NEXT: ld1 { v18.b }[6], [x8] +; CHECK-SD-NEXT: ld1 { v5.b }[4], [x10] +; CHECK-SD-NEXT: add x10, sp, #952 +; CHECK-SD-NEXT: ld1 { v7.b }[4], [x9] +; CHECK-SD-NEXT: ld1 { v16.b }[4], [x11] +; CHECK-SD-NEXT: ld1 { v0.b }[14], [x8] +; CHECK-SD-NEXT: add x8, sp, #120 +; CHECK-SD-NEXT: add x9, sp, #184 +; CHECK-SD-NEXT: add x11, sp, #960 +; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #888 +; CHECK-SD-NEXT: ld1 { v5.b }[5], [x9] +; CHECK-SD-NEXT: ld1 { v7.b }[5], [x8] +; CHECK-SD-NEXT: ld1 { v16.b }[5], [x10] +; CHECK-SD-NEXT: add x9, sp, #128 +; CHECK-SD-NEXT: add x10, sp, #192 ; CHECK-SD-NEXT: add x8, sp, #72 -; CHECK-SD-NEXT: ld1 { v16.b }[15], [x8] -; CHECK-SD-NEXT: add x8, sp, #968 -; CHECK-SD-NEXT: ld1 { v18.b }[7], [x8] -; CHECK-SD-NEXT: sdot v5.4s, v16.16b, v1.16b -; CHECK-SD-NEXT: zip1 v0.2d, v17.2d, v18.2d -; CHECK-SD-NEXT: sdot v5.4s, v2.16b, v1.16b -; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b -; CHECK-SD-NEXT: add v0.4s, 
v5.4s, v19.4s +; CHECK-SD-NEXT: sdot v2.4s, v6.16b, v1.16b +; CHECK-SD-NEXT: ld1 { v4.b }[6], [x9] +; CHECK-SD-NEXT: add x9, sp, #896 +; CHECK-SD-NEXT: ld1 { v5.b }[6], [x10] +; CHECK-SD-NEXT: ld1 { v7.b }[6], [x9] +; CHECK-SD-NEXT: ld1 { v16.b }[6], [x11] +; CHECK-SD-NEXT: ld1 { v0.b }[15], [x8] +; CHECK-SD-NEXT: add x8, sp, #136 +; CHECK-SD-NEXT: add x9, sp, #200 +; CHECK-SD-NEXT: add x10, sp, #968 +; CHECK-SD-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-SD-NEXT: add x8, sp, #904 +; CHECK-SD-NEXT: ld1 { v5.b }[7], [x9] +; CHECK-SD-NEXT: ld1 { v7.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v16.b }[7], [x10] +; CHECK-SD-NEXT: sdot v3.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: zip1 v0.2d, v4.2d, v5.2d +; CHECK-SD-NEXT: zip1 v4.2d, v7.2d, v16.2d +; CHECK-SD-NEXT: sdot v3.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: sdot v2.4s, v4.16b, v1.16b +; CHECK-SD-NEXT: add v0.4s, v3.4s, v2.4s ; CHECK-SD-NEXT: addv s0, v0.4s ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/nontemporal-store.ll b/llvm/test/CodeGen/AArch64/nontemporal-store.ll index 677ae02417510..1ac7ec3d180c7 100644 --- a/llvm/test/CodeGen/AArch64/nontemporal-store.ll +++ b/llvm/test/CodeGen/AArch64/nontemporal-store.ll @@ -683,43 +683,43 @@ define void @test_stnp_v17f32(<17 x float> %v, ptr %ptr) { ; ; CHECK-BE-LABEL: test_stnp_v17f32: ; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-BE-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-BE-NEXT: // kill: def $s4 killed $s4 def $q4 +; CHECK-BE-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-BE-NEXT: // kill: def $s5 killed $s5 def $q5 +; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-BE-NEXT: add x9, sp, #52 +; CHECK-BE-NEXT: add x10, sp, #20 +; CHECK-BE-NEXT: // kill: def $s6 killed $s6 def $q6 +; CHECK-BE-NEXT: // kill: def $s2 killed $s2 def $q2 ; CHECK-BE-NEXT: add x8, sp, #12 -; CHECK-BE-NEXT: add x9, sp, #20 -; CHECK-BE-NEXT: ldr s16, [sp, #36] ; 
CHECK-BE-NEXT: mov v0.s[1], v1.s[0] -; CHECK-BE-NEXT: ldr s1, [sp, #4] ; CHECK-BE-NEXT: mov v4.s[1], v5.s[0] -; CHECK-BE-NEXT: add x10, sp, #52 -; CHECK-BE-NEXT: // kill: def $s6 killed $s6 def $q6 -; CHECK-BE-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-BE-NEXT: ldr s5, [sp, #36] +; CHECK-BE-NEXT: ldr s16, [x9] +; CHECK-BE-NEXT: ldr s1, [sp, #4] +; CHECK-BE-NEXT: ldr s17, [x10] +; CHECK-BE-NEXT: add x9, sp, #44 +; CHECK-BE-NEXT: add x10, sp, #60 ; CHECK-BE-NEXT: // kill: def $s7 killed $s7 def $q7 ; CHECK-BE-NEXT: // kill: def $s3 killed $s3 def $q3 -; CHECK-BE-NEXT: ld1 { v1.s }[1], [x8] -; CHECK-BE-NEXT: ldr s5, [x9] -; CHECK-BE-NEXT: add x8, sp, #28 -; CHECK-BE-NEXT: add x9, sp, #44 -; CHECK-BE-NEXT: ld1 { v5.s }[1], [x8] -; CHECK-BE-NEXT: ld1 { v16.s }[1], [x9] -; CHECK-BE-NEXT: ldr s17, [x10] -; CHECK-BE-NEXT: add x8, sp, #60 +; CHECK-BE-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-BE-NEXT: add x9, sp, #28 +; CHECK-BE-NEXT: ld1 { v16.s }[1], [x10] ; CHECK-BE-NEXT: mov v4.s[2], v6.s[0] ; CHECK-BE-NEXT: mov v0.s[2], v2.s[0] -; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8] -; CHECK-BE-NEXT: ldr s2, [sp, #68] -; CHECK-BE-NEXT: add x8, x0, #32 -; CHECK-BE-NEXT: zip1 v1.2d, v1.2d, v5.2d -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: str s2, [x0, #64] -; CHECK-BE-NEXT: zip1 v5.2d, v16.2d, v17.2d +; CHECK-BE-NEXT: ld1 { v1.s }[1], [x8] +; CHECK-BE-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-BE-NEXT: add x8, x0, #48 +; CHECK-BE-NEXT: add x9, x0, #32 +; CHECK-BE-NEXT: zip1 v2.2d, v5.2d, v16.2d +; CHECK-BE-NEXT: ldr s5, [sp, #68] +; CHECK-BE-NEXT: zip1 v1.2d, v1.2d, v17.2d ; CHECK-BE-NEXT: mov v4.s[3], v7.s[0] ; CHECK-BE-NEXT: mov v0.s[3], v3.s[0] -; CHECK-BE-NEXT: st1 { v1.4s }, [x8] +; CHECK-BE-NEXT: str s5, [x0, #64] +; CHECK-BE-NEXT: st1 { v2.4s }, [x8] ; CHECK-BE-NEXT: add x8, x0, #16 -; CHECK-BE-NEXT: st1 { v5.4s }, [x9] +; CHECK-BE-NEXT: st1 { v1.4s }, [x9] ; CHECK-BE-NEXT: st1 { v4.4s }, [x8] ; CHECK-BE-NEXT: st1 { v0.4s }, [x0] ; CHECK-BE-NEXT: ret diff --git 
a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 484372865b22a..4b5862b046b14 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -521,7 +521,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_sub_i32 s1, 0, s7 ; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s0 ; GFX908-NEXT: v_mov_b32_e32 v17, 0 -; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX908-NEXT: v_rcp_f32_e32 v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -684,7 +684,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_sub_i32 s1, 0, s7 ; GFX90A-NEXT: v_mov_b32_e32 v19, 0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0 ; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index 62059cd989ba2..a2e3c9aa8acc5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -140,7 +140,7 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) { ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1 ; GCN-NEXT: s_mov_b32 s4, 0xf4240 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -222,7 +222,7 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) { ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, 
v1 ; GCN-NEXT: s_mov_b32 s4, 0xf4240 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 018eb779fc815..1659ca62a0516 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -45,7 +45,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX6-NEXT: s_sub_i32 s2, 0, s5 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 @@ -76,7 +76,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s4, 0, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 @@ -142,7 +142,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX6-NEXT: s_sub_i32 s2, 0, s5 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 @@ -170,7 +170,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s4, 0, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; 
GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 @@ -247,7 +247,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: s_abs_i32 s7, s4 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -285,7 +285,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-NEXT: s_sub_i32 s5, 0, s4 ; GFX9-NEXT: s_xor_b32 s3, s2, s3 ; GFX9-NEXT: s_abs_i32 s2, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_ashr_i32 s3, s3, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -364,7 +364,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-NEXT: s_sub_i32 s2, 0, s5 ; GFX6-NEXT: s_abs_i32 s6, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 31 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -398,7 +398,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-NEXT: s_sub_i32 s5, 0, s3 ; GFX9-NEXT: s_ashr_i32 s4, s2, 31 ; GFX9-NEXT: s_abs_i32 s2, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 @@ -457,7 +457,7 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: 
v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -478,7 +478,7 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 @@ -527,7 +527,7 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX6-NEXT: s_and_b32 s0, s6, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -550,7 +550,7 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: s_and_b32 s0, s2, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -606,7 +606,7 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX6-NEXT: s_sext_i32_i16 s5, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -631,7 +631,7 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: s_sext_i32_i16 s2, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 ; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; 
GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -687,7 +687,7 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX6-NEXT: s_sext_i32_i16 s2, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 ; GFX6-NEXT: s_xor_b32 s2, s2, s7 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s2, s2, 30 ; GFX6-NEXT: s_or_b32 s4, s2, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -715,7 +715,7 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: s_sext_i32_i16 s2, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 ; GFX9-NEXT: s_xor_b32 s2, s2, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -766,7 +766,7 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX6-NEXT: v_rcp_f32_e32 v1, v0 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -784,7 +784,7 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -830,7 +830,7 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX6-NEXT: v_rcp_f32_e32 v1, v0 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 ; GFX6-NEXT: s_lshr_b32 
s2, s6, 8 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 @@ -851,7 +851,7 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX9-NEXT: s_lshr_b32 s3, s2, 8 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 @@ -908,7 +908,7 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX6-NEXT: s_sext_i32_i8 s5, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -933,7 +933,7 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX9-NEXT: s_sext_i32_i8 s2, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 ; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -989,7 +989,7 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX6-NEXT: s_sext_i32_i8 s3, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GFX6-NEXT: s_xor_b32 s2, s3, s2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s2, s2, 30 ; GFX6-NEXT: s_lshr_b32 s4, s6, 8 ; GFX6-NEXT: s_or_b32 s5, s2, 1 @@ -1018,7 +1018,7 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX9-NEXT: s_sext_i32_i8 s3, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GFX9-NEXT: s_xor_b32 s2, s3, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_lshr_b32 s4, s6, 8 ; GFX9-NEXT: s_or_b32 s5, s2, 1 @@ -1184,13 +1184,13 @@ 
define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_sub_i32 s0, 0, s12 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s15 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v4, v4 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GFX6-NEXT: v_rcp_f32_e32 v6, v6 ; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -1281,8 +1281,8 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 ; GFX9-NEXT: s_sub_i32 s2, 0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s14 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -1307,7 +1307,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_i32 s3, s3, s4 ; GFX9-NEXT: s_mul_hi_u32 s3, s4, s3 ; GFX9-NEXT: s_add_i32 s4, s4, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v2 +; GFX9-NEXT: v_rcp_f32_e32 v0, v2 ; GFX9-NEXT: s_mul_hi_u32 s3, s9, s4 ; GFX9-NEXT: s_mul_i32 s4, s3, s13 ; GFX9-NEXT: s_sub_i32 s4, s9, s4 @@ -1327,7 +1327,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_i32 s4, s4, s5 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_mul_hi_u32 s4, s10, s5 ; GFX9-NEXT: s_mul_i32 s5, s4, s14 ; GFX9-NEXT: s_sub_i32 
s5, s10, s5 @@ -1502,8 +1502,8 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX6-NEXT: s_sub_i32 s0, 0, s12 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -1524,7 +1524,7 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_cselect_b32 s6, s1, s0 ; GFX6-NEXT: s_sub_i32 s0, 0, s13 ; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 @@ -1542,7 +1542,7 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_cselect_b32 s7, s1, s0 ; GFX6-NEXT: s_sub_i32 s0, 0, s14 ; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 @@ -1590,9 +1590,9 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 ; GFX9-NEXT: s_sub_i32 s2, 0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s14 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -1632,7 +1632,7 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: 
s_mul_i32 s4, s4, s5 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_mul_hi_u32 s4, s10, s5 ; GFX9-NEXT: s_mul_i32 s4, s4, s14 ; GFX9-NEXT: s_sub_i32 s4, s10, s4 @@ -1849,7 +1849,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: s_xor_b32 s2, s8, s12 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 @@ -1873,7 +1873,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: s_xor_b32 s6, s9, s13 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 @@ -1900,7 +1900,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_sub_i32 s7, 0, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX6-NEXT: v_rcp_f32_e32 v4, v4 ; GFX6-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, s7, v4 @@ -1925,7 +1925,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_sub_i32 s0, 0, s12 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v6 +; GFX6-NEXT: v_rcp_f32_e32 v1, v6 ; GFX6-NEXT: s_abs_i32 s1, s11 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -1970,7 +1970,7 @@ define amdgpu_kernel 
void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s3, 0, s0 ; GFX9-NEXT: s_abs_i32 s2, s8 ; GFX9-NEXT: s_xor_b32 s1, s8, s12 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_ashr_i32 s1, s1, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: s_sub_i32 s7, 0, s2 ; GFX9-NEXT: s_sub_i32 s8, s0, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_abs_i32 s6, s9 ; GFX9-NEXT: s_xor_b32 s3, s9, s13 ; GFX9-NEXT: s_ashr_i32 s3, s3, 31 @@ -2020,7 +2020,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_xor_b32 s0, s0, s3 ; GFX9-NEXT: s_sub_i32 s7, 0, s1 ; GFX9-NEXT: s_sub_i32 s3, s0, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_abs_i32 s6, s10 ; GFX9-NEXT: s_xor_b32 s2, s10, s14 ; GFX9-NEXT: s_ashr_i32 s2, s2, 31 @@ -2047,7 +2047,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_xor_b32 s5, s6, s2 ; GFX9-NEXT: s_sub_i32 s6, 0, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-NEXT: s_sub_i32 s2, s5, s2 ; GFX9-NEXT: s_abs_i32 s4, s11 ; GFX9-NEXT: s_xor_b32 s3, s11, s15 @@ -2247,7 +2247,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: s_ashr_i32 s2, s8, 31 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 @@ -2269,7 +2269,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x 
i32> %x, <4 x ; GFX6-NEXT: s_sub_i32 s3, 0, s1 ; GFX6-NEXT: s_xor_b32 s0, s0, s2 ; GFX6-NEXT: s_sub_i32 s7, s0, s2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_ashr_i32 s6, s9, 31 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2292,7 +2292,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_sub_i32 s2, 0, s1 ; GFX6-NEXT: s_xor_b32 s0, s0, s6 ; GFX6-NEXT: s_sub_i32 s6, s0, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_ashr_i32 s8, s10, 31 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2315,7 +2315,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX6-NEXT: s_sub_i32 s0, 0, s10 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s7 @@ -2356,7 +2356,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s3, 0, s0 ; GFX9-NEXT: s_abs_i32 s2, s8 ; GFX9-NEXT: s_ashr_i32 s1, s8, 31 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 @@ -2377,7 +2377,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: s_sub_i32 s7, 0, s2 ; GFX9-NEXT: s_sub_i32 s8, s0, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_abs_i32 s6, s9 ; GFX9-NEXT: s_ashr_i32 s3, s9, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2400,7 +2400,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; 
GFX9-NEXT: s_xor_b32 s0, s0, s3 ; GFX9-NEXT: s_sub_i32 s7, 0, s1 ; GFX9-NEXT: s_sub_i32 s3, s0, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_abs_i32 s6, s10 ; GFX9-NEXT: s_ashr_i32 s2, s10, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2423,7 +2423,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_xor_b32 s5, s6, s2 ; GFX9-NEXT: s_sub_i32 s6, 0, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GFX9-NEXT: v_rcp_f32_e32 v2, v1 ; GFX9-NEXT: s_sub_i32 s2, s5, s2 ; GFX9-NEXT: s_abs_i32 s4, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -2554,10 +2554,10 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v0 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_rcp_f32_e32 v5, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 @@ -2572,7 +2572,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX6-NEXT: v_rcp_f32_e32 v6, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 @@ -2583,7 +2583,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_lshr_b32 s4, s9, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX6-NEXT: v_rcp_f32_e32 v7, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, 
v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2611,12 +2611,12 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: s_and_b32 s6, s0, 0xffff ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: v_rcp_f32_e32 v5, v1 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s0, s3, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 @@ -2628,7 +2628,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc ; GFX9-NEXT: v_trunc_f32_e32 v2, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX9-NEXT: v_rcp_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: s_lshr_b32 s0, s3, 16 @@ -2639,7 +2639,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: s_lshr_b32 s0, s1, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 +; GFX9-NEXT: v_rcp_f32_e32 v8, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 @@ -2769,10 +2769,10 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v0 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_rcp_f32_e32 
v5, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 @@ -2790,7 +2790,7 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 ; GFX6-NEXT: s_and_b32 s5, s9, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX6-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v1 ; GFX6-NEXT: s_lshr_b32 s4, s11, 16 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 @@ -2798,7 +2798,7 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 ; GFX6-NEXT: s_lshr_b32 s5, s9, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX6-NEXT: v_rcp_f32_e32 v7, v4 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 @@ -2836,10 +2836,10 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: v_rcp_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s4, s3, 0xffff @@ -2854,7 +2854,7 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX9-NEXT: v_rcp_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 @@ -2866,7 +2866,7 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x 
i16> %x, <4 x ; GFX9-NEXT: s_lshr_b32 s1, s1, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 +; GFX9-NEXT: v_rcp_f32_e32 v8, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc @@ -3006,7 +3006,7 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_sext_i32_i16 s5, s8 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -3021,7 +3021,7 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 @@ -3037,7 +3037,7 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: s_sext_i32_i16 s4, s9 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX6-NEXT: v_rcp_f32_e32 v4, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 @@ -3053,7 +3053,7 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 ; GFX6-NEXT: s_ashr_i32 s4, s9, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v0 +; GFX6-NEXT: v_rcp_f32_e32 v5, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 @@ -3085,7 +3085,7 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x 
i16> %x, <4 x ; GFX9-NEXT: s_sext_i32_i16 s5, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX9-NEXT: s_xor_b32 s4, s5, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30 ; GFX9-NEXT: s_or_b32 s8, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -3099,7 +3099,7 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: s_ashr_i32 s0, s0, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: s_xor_b32 s0, s0, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_sext_i32_i16 s2, s3 @@ -3116,7 +3116,7 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 ; GFX9-NEXT: s_sext_i32_i16 s0, s1 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 +; GFX9-NEXT: v_rcp_f32_e32 v5, v0 ; GFX9-NEXT: s_xor_b32 s0, s0, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_or_b32 s0, s0, 1 @@ -3132,7 +3132,7 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 ; GFX9-NEXT: s_ashr_i32 s0, s1, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 +; GFX9-NEXT: v_rcp_f32_e32 v6, v0 ; GFX9-NEXT: s_xor_b32 s0, s0, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_or_b32 s2, s0, 1 @@ -3276,7 +3276,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_sext_i32_i16 s5, s8 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -3292,7 +3292,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_ashr_i32 s5, s8, 
16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX6-NEXT: v_rcp_f32_e32 v3, v1 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_lshr_b32 s6, s8, 16 @@ -3314,7 +3314,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v1 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX6-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -3331,7 +3331,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_ashr_i32 s5, s9, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_rcp_f32_e32 v5, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_lshr_b32 s6, s9, 16 ; GFX6-NEXT: s_lshr_b32 s7, s11, 16 @@ -3367,7 +3367,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: s_sext_i32_i16 s9, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX9-NEXT: s_xor_b32 s4, s9, s8 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30 ; GFX9-NEXT: s_or_b32 s10, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -3384,7 +3384,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_add_u32_e32 v1, s4, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s10 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: s_or_b32 s2, s2, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX9-NEXT: s_sext_i32_i16 s8, s1 @@ -3401,7 +3401,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s8 ; GFX9-NEXT: 
v_mul_lo_u32 v0, v0, s0 ; GFX9-NEXT: s_xor_b32 s0, s8, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX9-NEXT: v_rcp_f32_e32 v5, v3 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_or_b32 s0, s0, 1 ; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0 @@ -3418,7 +3418,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 ; GFX9-NEXT: s_ashr_i32 s2, s1, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX9-NEXT: v_rcp_f32_e32 v6, v4 ; GFX9-NEXT: s_xor_b32 s0, s2, s3 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_or_b32 s4, s0, 1 @@ -3475,7 +3475,7 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s2, s6, 0x30008 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX6-NEXT: v_rcp_f32_e32 v1, v0 ; GFX6-NEXT: s_and_b32 s4, s6, 7 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -3497,7 +3497,7 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: s_and_b32 s2, s2, 7 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 @@ -3545,7 +3545,7 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s2, s6, 0x30008 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX6-NEXT: v_rcp_f32_e32 v1, v0 ; GFX6-NEXT: s_and_b32 s3, s6, 7 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 ; GFX6-NEXT: s_lshr_b32 s2, s6, 8 @@ -3569,7 +3569,7 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s0, s2, 0x30008 ; GFX9-NEXT: 
v_cvt_f32_ubyte0_e32 v0, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: s_and_b32 s1, s2, 7 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s1 ; GFX9-NEXT: s_lshr_b32 s0, s2, 8 @@ -3630,7 +3630,7 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX6-NEXT: s_bfe_i32 s5, s6, 0x30000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -3656,7 +3656,7 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x30000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 ; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -3713,7 +3713,7 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX6-NEXT: s_bfe_i32 s3, s6, 0x30000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GFX6-NEXT: s_xor_b32 s2, s3, s2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s2, s2, 30 ; GFX6-NEXT: s_lshr_b32 s4, s6, 8 ; GFX6-NEXT: s_or_b32 s5, s2, 1 @@ -3742,7 +3742,7 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX9-NEXT: s_bfe_i32 s1, s2, 0x30000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 ; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_lshr_b32 s3, s2, 8 ; GFX9-NEXT: s_or_b32 s6, s0, 1 @@ -3845,10 +3845,10 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: 
v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v0 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_rcp_f32_e32 v5, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 @@ -3862,7 +3862,7 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_and_b32 s4, s9, 0xffff ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX6-NEXT: v_rcp_f32_e32 v6, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -3890,10 +3890,10 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: v_rcp_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s0, s3, 0xffff @@ -3908,7 +3908,7 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX9-NEXT: v_rcp_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 @@ -4013,10 +4013,10 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; 
GFX6-NEXT: v_rcp_f32_e32 v3, v0 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_rcp_f32_e32 v5, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 @@ -4031,7 +4031,7 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 ; GFX6-NEXT: s_and_b32 s6, s9, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX6-NEXT: v_rcp_f32_e32 v6, v4 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 @@ -4065,9 +4065,9 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_and_b32 s8, s0, 0xffff ; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: v_rcp_f32_e32 v5, v1 ; GFX9-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 @@ -4082,7 +4082,7 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GFX9-NEXT: v_rcp_f32_e32 v6, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 @@ -4197,7 +4197,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_sext_i32_i16 s5, s8 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: 
s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -4212,7 +4212,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 @@ -4228,7 +4228,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v3 ; GFX6-NEXT: s_sext_i32_i16 s4, s9 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX6-NEXT: v_rcp_f32_e32 v4, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 @@ -4258,7 +4258,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_sext_i32_i16 s5, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s5 ; GFX9-NEXT: s_xor_b32 s4, s5, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30 ; GFX9-NEXT: s_or_b32 s8, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -4273,7 +4273,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_ashr_i32 s0, s0, 16 ; GFX9-NEXT: v_add_u32_e32 v2, s4, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: s_xor_b32 s0, s0, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_sext_i32_i16 s2, s3 @@ -4289,7 +4289,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 ; GFX9-NEXT: s_sext_i32_i16 s0, s1 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 +; GFX9-NEXT: v_rcp_f32_e32 v5, v0 ; GFX9-NEXT: s_xor_b32 s0, s0, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; 
GFX9-NEXT: s_or_b32 s2, s0, 1 @@ -4406,7 +4406,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_sext_i32_i16 s5, s8 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -4422,7 +4422,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_ashr_i32 s5, s8, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX6-NEXT: v_rcp_f32_e32 v3, v1 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_lshr_b32 s6, s8, 16 @@ -4442,7 +4442,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_sext_i32_i16 s5, s9 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX6-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 ; GFX6-NEXT: s_or_b32 s7, s4, 1 @@ -4474,7 +4474,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_sext_i32_i16 s9, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX9-NEXT: s_xor_b32 s4, s9, s8 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30 ; GFX9-NEXT: s_or_b32 s10, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -4491,7 +4491,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_add_u32_e32 v1, s4, v2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s10 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_or_b32 s2, s2, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 @@ 
-4507,7 +4507,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_sext_i32_i16 s3, s1 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s3 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX9-NEXT: v_rcp_f32_e32 v4, v2 ; GFX9-NEXT: s_xor_b32 s0, s3, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_or_b32 s4, s0, 1 @@ -4614,7 +4614,7 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_and_b32 s6, s10, 0x7fff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], 30 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf000f ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX6-NEXT: s_mov_b32 s1, s9 @@ -4622,7 +4622,7 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: v_rcp_f32_e32 v5, v3 ; GFX6-NEXT: s_and_b32 s5, s6, 0x7fff ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 @@ -4633,7 +4633,7 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_mad_f32 v4, -v0, v3, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s8 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v1 +; GFX6-NEXT: v_rcp_f32_e32 v6, v1 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc @@ -4668,13 +4668,13 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_bfe_u32 s8, s2, 0xf000f ; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 30 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f ; GFX9-NEXT: 
v_cvt_f32_u32_e32 v4, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX9-NEXT: v_rcp_f32_e32 v6, v4 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 @@ -4686,7 +4686,7 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_mad_f32 v5, -v0, v4, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v1 +; GFX9-NEXT: v_rcp_f32_e32 v7, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc @@ -4797,7 +4797,7 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_and_b32 s6, s10, 0x7fff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], 30 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_and_b32 s5, s6, 0x7fff ; GFX6-NEXT: s_bfe_u32 s6, s4, 0xf000f ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 @@ -4810,7 +4810,7 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v3 +; GFX6-NEXT: v_rcp_f32_e32 v2, v3 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GFX6-NEXT: s_lshr_b32 s4, s4, 15 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s10, v0 @@ -4819,7 +4819,7 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_trunc_f32_e32 v0, v0 ; GFX6-NEXT: v_mad_f32 v1, -v0, v3, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 +; GFX6-NEXT: v_rcp_f32_e32 v6, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 ; GFX6-NEXT: s_mov_b32 s1, s9 ; GFX6-NEXT: v_addc_u32_e32 
v0, vcc, 0, v0, vcc @@ -4858,7 +4858,7 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_and_b32 s3, s2, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: s_and_b32 s8, s4, 0x7fff -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 30 ; GFX9-NEXT: s_bfe_u32 s5, s6, 0xf000f ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s5 @@ -4870,13 +4870,13 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_and_b32 s3, s4, 0x7fff ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s9 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX9-NEXT: v_rcp_f32_e32 v6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 ; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s8 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 +; GFX9-NEXT: v_rcp_f32_e32 v7, v3 ; GFX9-NEXT: v_mad_f32 v5, -v1, v4, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 @@ -5002,7 +5002,7 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], 30 ; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_xor_b32 s5, s5, s7 ; GFX6-NEXT: s_ashr_i32 s5, s5, 30 ; GFX6-NEXT: s_or_b32 s5, s5, 1 @@ -5018,7 +5018,7 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s5, v2 ; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf000f ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v0 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s7, s4, 1 @@ -5034,7 +5034,7 @@ define amdgpu_kernel void @sdiv_v3i15(ptr 
addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: s_bfe_i32 s4, s6, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX6-NEXT: v_rcp_f32_e32 v4, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 @@ -5070,7 +5070,7 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_bfe_i32 s5, s2, 0xf0000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX9-NEXT: s_xor_b32 s3, s5, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s3, s3, 30 ; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], 30 ; GFX9-NEXT: s_or_b32 s3, s3, 1 @@ -5085,7 +5085,7 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_bfe_i32 s2, s2, 0xf000f ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: s_xor_b32 s2, s2, s5 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_add_u32_e32 v3, s3, v3 @@ -5102,7 +5102,7 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_add_u32_e32 v4, s2, v4 ; GFX9-NEXT: s_bfe_i32 s2, s4, 0xf0000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 +; GFX9-NEXT: v_rcp_f32_e32 v5, v0 ; GFX9-NEXT: s_xor_b32 s2, s2, s3 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s4, s2, 1 @@ -5227,7 +5227,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_bfe_i32 s12, s10, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s12 ; GFX6-NEXT: s_xor_b32 s5, s12, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[10:11], 30 ; GFX6-NEXT: s_ashr_i32 s5, s5, 30 ; GFX6-NEXT: s_and_b32 s7, s6, 0x7fff @@ -5249,7 +5249,7 @@ define amdgpu_kernel void 
@srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf000f ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v0 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX6-NEXT: v_rcp_f32_e32 v3, v1 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s10, s4, 1 @@ -5266,7 +5266,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GFX6-NEXT: v_rcp_f32_e32 v4, v1 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s14 @@ -5304,7 +5304,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], 30 ; GFX9-NEXT: s_bfe_i32 s7, s2, 0xf0000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v0 ; GFX9-NEXT: s_xor_b32 s5, s7, s5 ; GFX9-NEXT: s_ashr_i32 s5, s5, 30 ; GFX9-NEXT: s_lshr_b32 s3, s2, 15 @@ -5325,7 +5325,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX9-NEXT: s_bfe_i32 s6, s2, 0xf000f ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX9-NEXT: v_rcp_f32_e32 v3, v1 ; GFX9-NEXT: s_xor_b32 s5, s6, s5 ; GFX9-NEXT: s_ashr_i32 s5, s5, 30 ; GFX9-NEXT: s_or_b32 s5, s5, 1 @@ -5342,7 +5342,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_bfe_i32 s4, s4, 0xf0000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX9-NEXT: s_xor_b32 s4, s4, s5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX9-NEXT: v_rcp_f32_e32 v4, v2 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30 ; GFX9-NEXT: s_or_b32 s6, s4, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s13 @@ -5656,8 +5656,8 @@ define amdgpu_kernel void 
@udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s11 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 @@ -5710,9 +5710,9 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s4, 0, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -5994,8 +5994,8 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_sub_i32 s6, 0, s2 ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 @@ -6043,9 +6043,9 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s4, 0, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 
0x4f7ffffe, v1 @@ -6461,7 +6461,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: s_sub_i32 s7, 0, s6 ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s7, v0 @@ -6488,7 +6488,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_sub_i32 s6, 0, s2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: s_xor_b32 s3, s1, s3 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: s_abs_i32 s1, s1 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 @@ -6530,7 +6530,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s3 ; GFX9-NEXT: s_abs_i32 s3, s0 ; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s2, 0, s6 ; GFX9-NEXT: s_ashr_i32 s0, s0, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -6555,7 +6555,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-NEXT: s_xor_b32 s5, s6, s0 ; GFX9-NEXT: s_sub_i32 s6, 0, s8 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s0, s5, s0 ; GFX9-NEXT: s_xor_b32 s4, s1, s7 ; GFX9-NEXT: s_abs_i32 s1, s1 @@ -6907,7 +6907,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX6-NEXT: s_sub_i32 s6, 0, s2 ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; 
GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 @@ -6930,7 +6930,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_sub_i32 s6, 0, s3 ; GFX6-NEXT: s_abs_i32 s8, s1 ; GFX6-NEXT: s_xor_b32 s2, s2, s0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_sub_i32 s0, s2, s0 ; GFX6-NEXT: s_ashr_i32 s1, s1, 31 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 @@ -6968,7 +6968,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_sub_i32 s7, 0, s2 ; GFX9-NEXT: s_ashr_i32 s6, s0, 31 ; GFX9-NEXT: s_abs_i32 s0, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -6990,7 +6990,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b32 s0, s0, s6 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-NEXT: s_sub_i32 s5, 0, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s0, s0, s6 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 ; GFX9-NEXT: s_abs_i32 s1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn-codegen.ll index 5f6a38018be20..7af5b0e177b7f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn-codegen.ll @@ -17,7 +17,7 @@ define float @test_rootn_afn_f32(float %x, i32 %y) #0 { ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 32, vcc ; CHECK-NEXT: v_ldexp_f32 v4, |v0|, v4 ; CHECK-NEXT: v_log_f32_e32 v4, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_rcp_f32_e32 v2, v2 ; CHECK-NEXT: v_mov_b32_e32 v3, 0x42000000 ; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; CHECK-NEXT: v_sub_f32_e32 v3, v4, v3 @@ -75,8 +75,8 @@ define <2 x float> 
@test_rootn_afn_v2f32(<2 x float> %x, <2 x i32> %y) #0 { ; CHECK-NEXT: v_log_f32_e32 v8, v8 ; CHECK-NEXT: v_ldexp_f32 v9, |v0|, v9 ; CHECK-NEXT: v_log_f32_e32 v9, v9 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CHECK-NEXT: v_rcp_f32_e32 v5, v5 +; CHECK-NEXT: v_rcp_f32_e32 v4, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; CHECK-NEXT: v_sub_f32_e32 v7, v8, v7 ; CHECK-NEXT: v_sub_f32_e32 v6, v9, v6 diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index 25071eb767851..150d8cfe22cfd 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -577,7 +577,7 @@ define i32 @sdiv32(i32 %a, i32 %b) { ; GFX9-NEXT: v_sub_u32_e32 v4, 0, v2 ; GFX9-NEXT: v_sub_u32_e32 v5, 0, v0 ; GFX9-NEXT: v_max_i32_e32 v5, v5, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 @@ -609,7 +609,7 @@ define i32 @udiv32(i32 %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v2 @@ -641,7 +641,7 @@ define i32 @srem32(i32 %a, i32 %b) { ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1 ; GFX9-NEXT: v_sub_u32_e32 v4, 0, v0 ; GFX9-NEXT: v_max_i32_e32 v4, v4, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -670,7 +670,7 @@ define i32 @urem32(i32 %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1 -; GFX9-NEXT: 
v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v2 @@ -1156,7 +1156,7 @@ define i64 @udiv64_known32(i64 %a, i64 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 6f9531ecfa1b8..1f751ecbe6645 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: s_sub_i32 s2, 0, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -53,7 +53,7 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX10-NEXT: s_sub_i32 s3, 0, s6 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s2, v0 @@ -97,7 +97,7 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX11-NEXT: s_sub_i32 s3, 0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; 
GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -162,7 +162,7 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: s_sub_i32 s2, 0, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -201,7 +201,7 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX10-NEXT: s_sub_i32 s3, 0, s6 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s2, v0 @@ -243,7 +243,7 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX11-NEXT: s_sub_i32 s3, 0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -309,7 +309,7 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_ashr_i32 s4, s6, 31 ; GFX9-NEXT: s_sub_i32 s5, 0, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 @@ -352,7 +352,7 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_ashr_i32 s3, s3, 31 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: s_sub_i32 s4, 0, s2 -; 
GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s5, v0 @@ -398,7 +398,7 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX11-NEXT: s_sub_i32 s4, 0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -464,7 +464,7 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_sub_i32 s4, 0, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 @@ -502,7 +502,7 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0 @@ -543,7 +543,7 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX11-NEXT: s_sub_i32 s3, 0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 
@@ -605,7 +605,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s2 @@ -636,7 +636,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: .LBB4_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s2 @@ -667,7 +667,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB4_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -718,7 +718,7 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_and_b32 s2, s0, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_and_b32 s4, 0xffff, s3 @@ -751,7 +751,7 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: .LBB5_1: 
; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s3 @@ -784,7 +784,7 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB5_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -838,7 +838,7 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_sext_i32_i16 s2, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: .LBB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_sext_i32_i16 s4, s3 @@ -875,7 +875,7 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: .LBB6_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_sext_i32_i16 s4, s3 @@ -912,7 +912,7 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB6_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -969,7 +969,7 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_sext_i32_i16 s2, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: .LBB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_sext_i32_i16 s6, s3 @@ -1008,7 +1008,7 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: .LBB7_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_sext_i32_i16 s4, s3 @@ -1047,7 +1047,7 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index fe0892788ca84..33424b7f0d16e 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -426,7 +426,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s4, 0, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 @@ -456,7 +456,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX90A-NEXT: s_sub_i32 s4, 
0, s3 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s5, v0 @@ -485,7 +485,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX10-NEXT: s_sub_i32 s5, 0, s3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0 @@ -516,7 +516,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-FLATSCR-NEXT: s_sub_i32 s4, 0, s3 -; GFX9-FLATSCR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-FLATSCR-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-FLATSCR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-FLATSCR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s5, v0 @@ -546,7 +546,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX11-NEXT: s_sub_i32 s5, 0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -582,8 +582,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX12-NEXT: s_cvt_f32_u32 s4, s3 ; GFX12-NEXT: s_sub_co_i32 s5, 0, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s4 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_s_rcp_f32 s4, s4 ; GFX12-NEXT: s_mul_f32 s4, s4, 0x4f7ffffe ; GFX12-NEXT: 
s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.bcast.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.bcast.ll new file mode 100644 index 0000000000000..1498c0a57b96f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.bcast.ll @@ -0,0 +1,1105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-GISEL %s + +define i32 @v_permlane_bcast_b32_vss(i32 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_bcast_b32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_b32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + 
%v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2) + ret i32 %v +} + +define i32 @v_permlane_bcast_b32_vii(i32 %src0) { +; GFX1250-LABEL: v_permlane_bcast_b32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_b32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 1, i32 2) + ret i32 %v +} + +define i32 @v_permlane_bcast_b32_vll(i32 %src0) { +; GFX1250-LABEL: v_permlane_bcast_b32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_b32_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 100, i32 102) + ret i32 %v +} + +define i32 @v_permlane_bcast_b32_vvv(i32 %src0) { +; GFX1250-LABEL: v_permlane_bcast_b32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_b32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %tidx, i32 %tidy) + ret i32 %v +} + +define float @v_permlane_bcast_f32_vss(float %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_bcast_f32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = 
call float @llvm.amdgcn.permlane.bcast(float %src0, i32 %src1, i32 %src2) + ret float %v +} + +define float @v_permlane_bcast_f32_vii(float %src0) { +; GFX1250-LABEL: v_permlane_bcast_f32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.bcast(float %src0, i32 1, i32 2) + ret float %v +} + +define float @v_permlane_bcast_f32_vll(float %src0) { +; GFX1250-LABEL: v_permlane_bcast_f32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f32_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.bcast(float %src0, i32 100, i32 102) + ret float %v +} + +define float @v_permlane_bcast_f32_vvv(float %src0) { +; GFX1250-LABEL: v_permlane_bcast_f32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call float @llvm.amdgcn.permlane.bcast(float %src0, i32 %tidx, i32 %tidy) + ret float %v +} + +define i64 @v_permlane_bcast_i64_vss(i64 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_bcast_i64_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_i64_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: 
v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.bcast(i64 %src0, i32 %src1, i32 %src2) + ret i64 %v +} + +define i64 @v_permlane_bcast_i64_vii(i64 %src0) { +; GFX1250-LABEL: v_permlane_bcast_i64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_i64_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.bcast(i64 %src0, i32 1, i32 2) + ret i64 %v +} + +define i64 @v_permlane_bcast_i64_vll(i64 %src0) { +; GFX1250-LABEL: v_permlane_bcast_i64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_i64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.bcast(i64 %src0, i32 100, i32 102) + ret i64 %v +} + 
+define i64 @v_permlane_bcast_i64_vvv(i64 %src0) { +; GFX1250-LABEL: v_permlane_bcast_i64_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_i64_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlane.bcast(i64 %src0, i32 %tidx, i32 %tidy) + ret i64 %v +} + +define double @v_permlane_bcast_f64_vss(double %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_bcast_f64_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; 
GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f64_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.bcast(double %src0, i32 %src1, i32 %src2) + ret double %v +} + +define double @v_permlane_bcast_f64_vii(double %src0) { +; GFX1250-LABEL: v_permlane_bcast_f64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f64_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.bcast(double %src0, i32 1, i32 2) + ret double %v +} + +define double @v_permlane_bcast_f64_vll(double %src0) { +; GFX1250-LABEL: v_permlane_bcast_f64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.bcast(double %src0, i32 100, i32 102) + ret double %v +} + +define double @v_permlane_bcast_f64_vvv(double %src0) { +; GFX1250-LABEL: v_permlane_bcast_f64_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f64_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlane.bcast(double %src0, i32 %tidx, i32 %tidy) + ret double %v +} + +; 
does not work for GISEL +;define void @v_permlane_bcast_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1, i32 %src2) { +; %v = call bfloat @llvm.amdgcn.permlane.bcast.bf16(bfloat %src, i32 %src1, i32 %src2) +; store bfloat %v, ptr addrspace(1) %out, align 4 +; ret void +;} + +define void @v_permlane_bcast_i16(ptr addrspace(1) %out, i16 %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_bcast_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b16 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i16 @llvm.amdgcn.permlane.bcast.i16(i16 %src, i32 %src1, i32 %src2) + store i16 %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_bcast_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_bcast_v2f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_v2f16: +; 
GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b32 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlane.bcast.v2f16(<2 x half> %src, i32 %src1, i32 %src2) + store <2 x half> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_bcast_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_bcast_v2f32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_bcast_v2f32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_bcast_v2f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; 
GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_bcast_v2f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlane.bcast.v2f32(<2 x float> %src, i32 %src1, i32 %src2) + store <2 x float> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_bcast_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_bcast_v7i32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 
+; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_bcast_v7i32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_bcast_v7i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: 
v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_bcast_v7i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlane.bcast.v7i32(<7 x i32> %src, i32 %src1, i32 %src2) + store <7 x i32> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_bcast_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_bcast_v8i16: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 
v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_bcast_v8i16: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_bcast_v8i16: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_bcast_v8i16: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlane.bcast.v8i16(<8 x i16> %src, i32 %src1, i32 %src2) + store <8 x i16> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_bcast_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_bcast_v2i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_bcast_v2i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_bcast_v2i64: +; GFX13-SDAG: ; %bb.0: +; 
GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_bcast_v2i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x i64> @llvm.amdgcn.permlane.bcast.v2i64(<2 x i64> %src, i32 %src1, i32 %src2) + store <2 x i64> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_bcast_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_bcast_v3i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_bcast_v3i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_bcast_v3i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: 
v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_bcast_v3i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <3 x i64> @llvm.amdgcn.permlane.bcast.v3i64(<3 x i64> %src, i32 %src1, i32 %src2) + store <3 x i64> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_bcast_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_bcast_v4f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 
v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_bcast_v4f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v9, v9, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_bcast_v4f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 
s1, v11 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_bcast_v4f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <4 x double> @llvm.amdgcn.permlane.bcast.v4f64(<4 x double> %src, i32 %src1, i32 %src2) + store <4 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void 
@v_permlane_bcast_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_bcast_v8f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v17, v17, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v16, v16, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v15, v15, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v14, v14, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v13, v13, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v12, v12, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v11, v11, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v10, v10, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x3 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_bcast_v8f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v9, v9, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v10, v10, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v11, v11, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v12, v12, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v13, v13, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v14, v14, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v15, v15, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v16, v16, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v17, v17, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x3 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_bcast_v8f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v17, v17, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v16, v16, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v15, v15, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v14, v14, s0, 
s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v13, v13, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v12, v12, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v11, v11, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v10, v10, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x3 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_bcast_v8f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v10, v10, s0, s1 +; GFX13-GISEL-NEXT: 
v_permlane_bcast_b32 v11, v11, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v12, v12, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v13, v13, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v14, v14, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v15, v15, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v16, v16, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v17, v17, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x3 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x double> @llvm.amdgcn.permlane.bcast.v8f64(<8 x double> %src, i32 %src1, i32 %src2) + store <8 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.down.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.down.ll new file mode 100644 index 0000000000000..75548d5cc0594 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.down.ll @@ -0,0 +1,1105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-GISEL %s + +define i32 @v_permlane_down_b32_vss(i32 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_down_b32_vss: +; 
GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_b32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2) + ret i32 %v +} + +define i32 @v_permlane_down_b32_vii(i32 %src0) { +; GFX1250-LABEL: v_permlane_down_b32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_b32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 1, i32 2) + ret i32 %v +} + +define i32 @v_permlane_down_b32_vll(i32 %src0) { +; GFX1250-LABEL: v_permlane_down_b32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_b32_vll: +; GFX13: ; %bb.0: +; 
GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 100, i32 102) + ret i32 %v +} + +define i32 @v_permlane_down_b32_vvv(i32 %src0) { +; GFX1250-LABEL: v_permlane_down_b32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_b32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %tidx, i32 %tidy) + ret i32 %v +} + +define float @v_permlane_down_f32_vss(float %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_down_f32_vss: +; GFX1250: ; %bb.0: +; 
GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.down(float %src0, i32 %src1, i32 %src2) + ret float %v +} + +define float @v_permlane_down_f32_vii(float %src0) { +; GFX1250-LABEL: v_permlane_down_f32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.down(float %src0, i32 1, i32 2) + ret float %v +} + +define float @v_permlane_down_f32_vll(float %src0) { +; GFX1250-LABEL: v_permlane_down_f32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f32_vll: +; GFX13: ; %bb.0: +; 
GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.down(float %src0, i32 100, i32 102) + ret float %v +} + +define float @v_permlane_down_f32_vvv(float %src0) { +; GFX1250-LABEL: v_permlane_down_f32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call float @llvm.amdgcn.permlane.down(float %src0, i32 %tidx, i32 %tidy) + ret float %v +} + +define i64 @v_permlane_down_i64_vss(i64 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_down_i64_vss: +; GFX1250: ; %bb.0: +; 
GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_down_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_i64_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_down_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.down(i64 %src0, i32 %src1, i32 %src2) + ret i64 %v +} + +define i64 @v_permlane_down_i64_vii(i64 %src0) { +; GFX1250-LABEL: v_permlane_down_i64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_down_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_i64_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_down_b32 v1, v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.down(i64 %src0, i32 1, i32 2) + ret i64 %v +} + +define i64 @v_permlane_down_i64_vll(i64 %src0) { +; GFX1250-LABEL: v_permlane_down_i64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: v_permlane_down_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_i64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_down_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.down(i64 %src0, i32 100, i32 102) + ret i64 %v +} + +define i64 @v_permlane_down_i64_vvv(i64 %src0) { +; GFX1250-LABEL: v_permlane_down_i64_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_down_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_i64_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_down_b32 v1, v1, 
s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlane.down(i64 %src0, i32 %tidx, i32 %tidy) + ret i64 %v +} + +define double @v_permlane_down_f64_vss(double %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_down_f64_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_down_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f64_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_down_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.down(double %src0, i32 %src1, i32 %src2) + ret double %v +} + +define double @v_permlane_down_f64_vii(double %src0) { +; GFX1250-LABEL: v_permlane_down_f64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_down_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f64_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_down_b32 v1, 
v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.down(double %src0, i32 1, i32 2) + ret double %v +} + +define double @v_permlane_down_f64_vll(double %src0) { +; GFX1250-LABEL: v_permlane_down_f64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: v_permlane_down_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_down_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.down(double %src0, i32 100, i32 102) + ret double %v +} + +define double @v_permlane_down_f64_vvv(double %src0) { +; GFX1250-LABEL: v_permlane_down_f64_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_down_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f64_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; 
GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_down_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlane.down(double %src0, i32 %tidx, i32 %tidy) + ret double %v +} + +; does not work for GISEL +;define void @v_permlane_down_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1, i32 %src2) { +; %v = call bfloat @llvm.amdgcn.permlane.down.bf16(bfloat %src, i32 %src1, i32 %src2) +; store bfloat %v, ptr addrspace(1) %out, align 4 +; ret void +;} + +define void @v_permlane_down_i16(ptr addrspace(1) %out, i16 %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_down_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b16 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i16 
@llvm.amdgcn.permlane.down.i16(i16 %src, i32 %src1, i32 %src2) + store i16 %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_down_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_down_v2f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_v2f16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b32 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlane.down.v2f16(<2 x half> %src, i32 %src1, i32 %src2) + store <2 x half> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_down_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_down_v2f32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_down_v2f32: +; 
GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_down_v2f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_down_v2f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlane.down.v2f32(<2 x float> %src, i32 %src1, i32 %src2) + store <2 x float> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_down_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 
%src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_down_v7i32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_down_v7i32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_down_v7i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_down_v7i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlane.down.v7i32(<7 x i32> %src, i32 %src1, i32 %src2) 
+ store <7 x i32> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_down_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_down_v8i16: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_down_v8i16: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_down_v8i16: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; 
GFX13-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_down_v8i16: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlane.down.v8i16(<8 x i16> %src, i32 %src1, i32 %src2) + store <8 x i16> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_down_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_down_v2i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_down_v2i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_down_v2i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_down_v2i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: 
s_set_pc_i64 s[30:31] + %v = call <2 x i64> @llvm.amdgcn.permlane.down.v2i64(<2 x i64> %src, i32 %src1, i32 %src2) + store <2 x i64> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_down_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_down_v3i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_down_v3i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; 
GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_down_v3i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_down_v3i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <3 x i64> 
@llvm.amdgcn.permlane.down.v3i64(<3 x i64> %src, i32 %src1, i32 %src2) + store <3 x i64> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_down_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_down_v4f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_down_v4f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v9, v9,
s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_down_v4f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_down_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_down_v4f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; 
GFX13-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <4 x double> @llvm.amdgcn.permlane.down.v4f64(<4 x double> %src, i32 %src1, i32 %src2) + store <4 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_down_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_down_v8f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v17, v17, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v16, v16, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v15, v15, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v14, v14, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v13, v13, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v12, v12, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v11, v11, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v10, v10, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x3 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; 
GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_down_v8f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v9, v9, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v10, v10, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v11, v11, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v12, v12, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v13, v13, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v14, v14, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v15, v15, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v16, v16, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v17, v17, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x3 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_down_v8f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_down_b32 v17, v17, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v16, v16, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v15, v15, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v14, v14, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v13, v13, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v12, v12, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v11, v11, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v10, v10, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x3 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_down_v8f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: 
v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v10, v10, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v11, v11, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v12, v12, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v13, v13, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v14, v14, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v15, v15, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v16, v16, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v17, v17, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x3 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x double> @llvm.amdgcn.permlane.down.v8f64(<8 x double> %src, i32 %src1, i32 %src2) + store <8 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll deleted file mode 100644 index 72a14536bebd4..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll +++ /dev/null @@ -1,440 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s -; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 
-mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s - -define amdgpu_kernel void @v_permlane_bcast_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX1250-LABEL: v_permlane_bcast_b32_vss: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s3, s6 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_bcast_b32_vii(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_bcast_b32_vii: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 1, i32 2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_bcast_b32_vll(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_bcast_b32_vll: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_movk_i32 s2, 0x64 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s2, 0x66 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 100, i32 102) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_bcast_b32_vvv(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-SDAG-LABEL: v_permlane_bcast_b32_vvv: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v1, v1, s3, s2 -; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: v_permlane_bcast_b32_vvv: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 
:: v_dual_mov_b32 v0, s2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v0, v0, s3, s4 -; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %tidx, i32 %tidy) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_down_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX1250-LABEL: v_permlane_down_b32_vss: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s3, s6 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_down_b32_vii(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_down_b32_vii: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_down_b32 v0, v0, 1, 2 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 1, i32 2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void 
@v_permlane_down_b32_vll(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_down_b32_vll: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_movk_i32 s2, 0x64 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s2, 0x66 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 100, i32 102) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_down_b32_vvv(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-SDAG-LABEL: v_permlane_down_b32_vvv: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-SDAG-NEXT: v_permlane_down_b32 v1, v1, s3, s2 -; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: v_permlane_down_b32_vvv: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; 
GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_permlane_down_b32 v0, v0, s3, s4 -; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %tidx, i32 %tidy) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_up_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX1250-LABEL: v_permlane_up_b32_vss: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s3, s6 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_up_b32_vii(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_up_b32_vii: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_up_b32 v0, v0, 1, 2 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 1, i32 2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_up_b32_vll(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_up_b32_vll: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_movk_i32 s2, 0x64 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s2, 0x66 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 100, i32 102) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_up_b32_vvv(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-SDAG-LABEL: v_permlane_up_b32_vvv: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-SDAG-NEXT: v_permlane_up_b32 v1, v1, s3, s2 -; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; 
GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: v_permlane_up_b32_vvv: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_permlane_up_b32 v0, v0, s3, s4 -; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %tidx, i32 %tidy) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_xor_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX1250-LABEL: v_permlane_xor_b32_vss: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s3, s6 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void 
@v_permlane_xor_b32_vii(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_xor_b32_vii: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 1, i32 2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_xor_b32_vll(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_xor_b32_vll: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_movk_i32 s2, 0x64 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s2, 0x66 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 100, i32 102) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_xor_b32_vvv(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-SDAG-LABEL: v_permlane_xor_b32_vvv: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 
s3, v1 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v1, v1, s3, s2 -; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: v_permlane_xor_b32_vvv: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v0, v0, s3, s4 -; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %tidx, i32 %tidy) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_idx_gen_b32_vs(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX1250-LABEL: v_permlane_idx_gen_b32_vs: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_idx_gen_b32 
v0, v0, s3 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 %src1) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_idx_gen_b32_vi(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_idx_gen_b32_vi: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, 1 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 1) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_idx_gen_b32_vl(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_idx_gen_b32_vl: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, 0x64 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 100) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_idx_gen_b32_vv(ptr addrspace(1) %out) { -; GFX1250-LABEL: v_permlane_idx_gen_b32_vv: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_bfe_u32 v1, v0, 10, 10 -; GFX1250-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1250-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, s2 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %tidx, i32 %tidy) - store i32 %v, ptr addrspace(1) %out - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.idx.gen.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.idx.gen.ll new file mode 100644 index 0000000000000..887c9cd3d8483 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.idx.gen.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13 %s + +define i32 @v_permlane_idx_gen_b32_vs(i32 %src0, i32 %src1) { +; GFX1250-LABEL: v_permlane_idx_gen_b32_vs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_idx_gen_b32_vs: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: 
s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_idx_gen_b32 v0, v0, s0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 %src1) + ret i32 %v +} + +define i32 @v_permlane_idx_gen_b32_vi(i32 %src0) { +; GFX1250-LABEL: v_permlane_idx_gen_b32_vi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, 1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_idx_gen_b32_vi: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_idx_gen_b32 v0, v0, 1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 1) + ret i32 %v +} + +define i32 @v_permlane_idx_gen_b32_vl(i32 %src0) { +; GFX1250-LABEL: v_permlane_idx_gen_b32_vl: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, 0x64 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_idx_gen_b32_vl: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_idx_gen_b32 v0, v0, 0x64 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 100) + ret i32 %v +} + +define i32 @v_permlane_idx_gen_b32_vv() { +; GFX1250-LABEL: v_permlane_idx_gen_b32_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_bfe_u32 v0, v31, 10, 10 +; 
GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_idx_gen_b32_vv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_readfirstlane_b32 s0, v0 +; GFX13-NEXT: v_permlane_idx_gen_b32 v0, v1, s0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %tidx, i32 %tidy) + ret i32 %v +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 925e91022a27b..ed3d57d9fbda0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -5,6 +5,8 @@ ; RUN: llc -global-isel=1 -new-reg-bank-select -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s ; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s ; RUN: llc -global-isel=1 -new-reg-bank-select -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -global-isel-abort=2 
-amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-GISEL %s declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) @@ -47,6 +49,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vss_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void @@ -88,6 +102,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vss_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) store float %v, ptr addrspace(1) %out ret void @@ -177,6 +203,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; 
GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vss_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -266,6 +320,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vss_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; 
GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -305,6 +387,16 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vii_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) store i32 %v, ptr 
addrspace(1) %out ret void @@ -344,6 +436,16 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float ; GFX12-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vii_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false) store float %v, ptr addrspace(1) %out ret void @@ -421,6 +523,30 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vii_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vii_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX13-GISEL-NEXT: 
v_permlane16_b32 v1, v1, 1, 2 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -498,6 +624,30 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vii_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vii_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -542,6 +692,17 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; 
GFX13-LABEL: v_permlane16_b32_vll_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void @@ -629,6 +790,34 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vll_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vll_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: 
v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -672,6 +861,17 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float ; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vll_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false) store float %v, ptr addrspace(1) %out ret void @@ -759,6 +959,34 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vll_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: 
v_permlane16_b32_vll_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -862,6 +1090,37 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvv_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvv_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -935,6 +1194,40 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % ; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvv_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s4, v1 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvv_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -1040,6 +1333,37 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvv_f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvv_f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -1113,6 +1437,40 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl ; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvv_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s4, v1 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvv_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -1192,6 +1550,32 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvs_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvs_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 
v0, s2 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -1264,6 +1648,40 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 % ; GFX12-NEXT: v_permlane16_b32 v0, v0, s5, s4 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvs_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s5, s4 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s5, s4 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvs_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s5, s4 +; GFX13-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s5, s4 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -1342,6 +1760,32 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvs_f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvs_f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -1414,6 +1858,40 @@ define amdgpu_kernel void 
@v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl ; GFX12-NEXT: v_permlane16_b32 v0, v0, s5, s4 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvs_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s5, s4 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s5, s4 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvs_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s5, s4 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s5, s4 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -1496,6 +1974,32 @@ define 
amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vsv_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vsv_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -1602,6 +2106,40 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vsv_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; 
GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vsv_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -1684,6 +2222,32 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vsv_f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_wait_kmcnt 
0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vsv_f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -1790,6 +2354,40 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vsv_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vsv_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -1832,6 +2430,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i3 ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vss_fi_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i32 %v, ptr addrspace(1) %out ret 
void @@ -1921,6 +2531,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -1962,6 +2600,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, fl ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; 
+; GFX13-LABEL: v_permlane16_b32_vss_fi_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) store float %v, ptr addrspace(1) %out ret void @@ -2051,6 +2701,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -2092,6 +2770,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i3 ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vss_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i32 %v, ptr addrspace(1) %out ret void @@ -2181,6 +2871,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, 
v1, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i64 %v, ptr addrspace(1) %out ret void @@ -2222,6 +2940,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, fl ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vss_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true) store float %v, ptr addrspace(1) %out ret void @@ -2311,6 +3041,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX12-GISEL-NEXT: 
v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true) store double %v, ptr addrspace(1) %out ret void @@ -2352,6 +3110,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vss_fi_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: 
s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i32 %v, ptr addrspace(1) %out ret void @@ -2441,6 +3211,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: 
v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i64 %v, ptr addrspace(1) %out ret void @@ -2482,6 +3280,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vss_fi_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) store float %v, ptr addrspace(1) %out ret void @@ -2571,6 +3381,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: 
global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true) store double %v, ptr addrspace(1) %out ret void @@ -2612,6 +3450,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32 ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void @@ -2653,6 +3503,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, floa ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_f32: 
+; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) store float %v, ptr addrspace(1) %out ret void @@ -2742,6 +3604,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vss_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: 
v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -2831,6 +3721,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vss_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -2870,6 
+3788,16 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 ; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vii_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void @@ -2909,6 +3837,16 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa ; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vii_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false) store float %v, ptr addrspace(1) %out ret void @@ -2986,6 +3924,30 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vii_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vii_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -3063,6 +4025,30 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vii_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vii_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 
+; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -3107,6 +4093,17 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vll_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void @@ -3150,6 +4147,17 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vll_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float 
@llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false) store float %v, ptr addrspace(1) %out ret void @@ -3237,6 +4245,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vll_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vll_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -3324,6 +4360,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 
0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vll_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vll_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -3427,6 +4491,37 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvv_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: 
v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvv_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -3532,6 +4627,37 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvv_f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: 
v_readfirstlane_b32 s3, v1 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvv_f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -3605,6 +4731,40 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 ; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvv_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s4, v1 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 
0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvv_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -3678,6 +4838,40 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub ; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvv_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; 
GFX13-SDAG-NEXT: v_readfirstlane_b32 s4, v1 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvv_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -3757,6 +4951,32 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvs_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; 
GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvs_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -3835,6 +5055,32 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvs_f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvs_f32: +; 
GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -3907,6 +5153,40 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i64(ptr addrspace(1) %out, i64 ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s5, s4 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvs_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s5, s4 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s5, s4 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvs_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s5, s4 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s5, s4 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -3979,6 +5259,40 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, doub ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s5, s4 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvs_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s5, s4 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s5, s4 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvs_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s5, s4 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s5, s4 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -4061,6 +5375,32 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vsv_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vsv_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -4143,6 +5483,32 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vsv_f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vsv_f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -4249,6 +5615,40 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vsv_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vsv_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -4355,6 +5755,40 @@ 
define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vsv_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vsv_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store double %v, ptr 
addrspace(1) %out @@ -4397,6 +5831,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_fi_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i32 %v, ptr addrspace(1) %out ret void @@ -4438,6 +5884,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, f ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_fi_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) store float %v, ptr addrspace(1) %out ret void @@ -4527,6 +5985,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; 
GFX13-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -4616,6 +6102,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; 
GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -4657,6 +6171,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v 
= call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i32 %v, ptr addrspace(1) %out ret void @@ -4698,6 +6224,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, f ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true) store float %v, ptr addrspace(1) %out ret void @@ -4787,6 +6325,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: +; 
GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i64 %v, ptr addrspace(1) %out ret void @@ -4876,6 +6442,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true) store double %v, ptr addrspace(1) %out ret void @@ -4917,6 +6511,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_fi_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i32 %v, ptr addrspace(1) %out ret void @@ -4958,6 +6564,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_fi_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: 
v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) store float %v, ptr addrspace(1) %out ret void @@ -5047,6 +6665,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 
true) store i64 %v, ptr addrspace(1) %out ret void @@ -5136,6 +6782,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true) store double %v, ptr addrspace(1) %out ret void @@ -5176,6 +6850,19 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-NEXT: 
global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_tid_tid_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -5217,6 +6904,19 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_tid_tid_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %v = call float @llvm.amdgcn.permlane16.f32(float %tidx_f32, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) @@ -5306,6 +7006,34 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %tidx_i64, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) @@ -5399,6 +7127,36 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; 
GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -5442,6 +7200,19 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_undef_tid_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -5484,6 +7255,19 @@ define 
amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_undef_tid_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -5572,6 +7356,34 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 
0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -5666,6 +7478,36 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: 
s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -5753,6 +7595,33 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -5837,6 +7706,33 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_f32: +; GFX13-SDAG: ; %bb.0: 
+; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %v = call float @llvm.amdgcn.permlane16.f32(float 1234.5, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) @@ -5928,6 +7824,36 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %v = call i64 @llvm.amdgcn.permlane16.i64(i64 12345, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) @@ -6029,6 +7955,38 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; 
GFX13-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -6072,6 +8030,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_i_tid_fi_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) @@ -6114,6 +8085,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX12-NEXT: 
v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_i_tid_fi_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -6202,6 +8186,34 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -6296,6 +8308,36 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; 
GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -6340,6 +8382,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_i_tid_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) @@ -6382,6 +8437,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_i_tid_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -6470,6 +8538,34 @@ define amdgpu_kernel void 
@v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -6564,6 +8660,36 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -6608,6 +8734,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %ou ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) @@ -6650,6 +8789,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %ou ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -6738,6 +8890,34 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; 
GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -6832,6 +9012,36 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; 
GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -6876,6 +9086,19 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_tid_tid_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -6917,6 +9140,19 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_tid_tid_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %v = call float @llvm.amdgcn.permlanex16.f32(float %tidx_f32, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) @@ -7006,6 +9242,34 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] 
+; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %tidx_i64, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) @@ -7099,6 +9363,36 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -7142,6 +9436,19 @@ define 
amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_undef_tid_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -7184,6 +9491,19 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_undef_tid_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -7272,6 +9592,34 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: 
s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -7366,6 +9714,36 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -7453,6 +9831,33 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; 
GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -7537,6 +9942,33 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = 
call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %v = call float @llvm.amdgcn.permlanex16.f32(float 1234.5, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) @@ -7628,6 +10060,36 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 12345, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) 
@@ -7729,6 +10191,38 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -7772,6 +10266,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; 
GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_i_tid_fi_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) @@ -7814,6 +10321,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_i_tid_fi_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -7902,6 +10422,34 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 
+; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -7996,6 +10544,36 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -8040,6 +10618,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_i_tid_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 
%src1, i32 %src2, i1 false, i1 true) @@ -8082,6 +10673,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_i_tid_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -8170,6 +10774,34 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], 
s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -8264,6 +10896,36 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -8308,6 +10970,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %o ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) @@ -8350,6 +11025,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %o ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; 
GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -8438,6 +11126,34 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -8532,6 +11248,36 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] 
; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -8575,6 +11321,20 @@ define void @v_permlane16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32 ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlane16_half: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: 
s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call half @llvm.amdgcn.permlane16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false) store half %v, ptr addrspace(1) %out ret void @@ -8614,6 +11374,20 @@ define void @v_permlanex16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i3 ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlanex16_half: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call half @llvm.amdgcn.permlanex16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false) store half %v, ptr addrspace(1) %out ret void @@ -8653,6 +11427,20 @@ define void @v_permlane16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1, ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlane16_bfloat: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call bfloat @llvm.amdgcn.permlane16.f16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false) store bfloat %v, ptr addrspace(1) %out ret void @@ -8692,6 +11480,20 @@ define void @v_permlanex16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1 ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlanex16_bfloat: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call bfloat @llvm.amdgcn.permlanex16.f16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false) store bfloat %v, ptr addrspace(1) %out ret void @@ -8731,6 +11533,20 @@ define void @v_permlane16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 % ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlane16_i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call i16 @llvm.amdgcn.permlane16.i16(i16 %src0, i16 
%src0, i32 %src1, i32 %src2, i1 false, i1 false) store i16 %v, ptr addrspace(1) %out ret void @@ -8770,6 +11586,20 @@ define void @v_permlanex16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlanex16_i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call i16 @llvm.amdgcn.permlanex16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i16 %v, ptr addrspace(1) %out ret void @@ -8809,6 +11639,20 @@ define void @v_permlane16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %sr ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlane16_v2f16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b32 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call <2 x half> @llvm.amdgcn.permlane16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <2 x half> %v, ptr addrspace(1) %out ret void @@ -8848,6 +11692,20 @@ define void @v_permlanex16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %s ; 
GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlanex16_v2f16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b32 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call <2 x half> @llvm.amdgcn.permlanex16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <2 x half> %v, ptr addrspace(1) %out ret void @@ -8927,6 +11785,36 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane16_v2f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane16_v2f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v4 
+; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <2 x float> @llvm.amdgcn.permlane16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <2 x float> %v, ptr addrspace(1) %out ret void @@ -9006,6 +11894,36 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 % ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlanex16_v2f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlanex16_v2f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-GISEL-NEXT: s_set_pc_i64 
s[30:31] %v = call <2 x float> @llvm.amdgcn.permlanex16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <2 x float> %v, ptr addrspace(1) %out ret void @@ -9125,6 +12043,50 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane16_v7i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane16_v7i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: 
v_permlane16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <7 x i32> @llvm.amdgcn.permlane16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <7 x i32> %v, ptr addrspace(1) %out ret void @@ -9244,6 +12206,50 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlanex16_v7i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlanex16_v7i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 
+; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <7 x i32> @llvm.amdgcn.permlanex16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <7 x i32> %v, ptr addrspace(1) %out ret void @@ -9292,6 +12298,23 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlane16_v8i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call <8 x i16> @llvm.amdgcn.permlane16.v8i16(<8 
x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <8 x i16> %v, ptr addrspace(1) %out ret void @@ -9340,6 +12363,23 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlanex16_v8i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call <8 x i16> @llvm.amdgcn.permlanex16.v8i16(<8 x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <8 x i16> %v, ptr addrspace(1) %out ret void @@ -9431,6 +12471,40 @@ define void @v_permlane16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %src ; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane16_v2i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 
v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane16_v2i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <2 x i64> @llvm.amdgcn.permlane16.v2i64(<2 x i64> %src0, <2 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <2 x i64> %v, ptr addrspace(1) %out ret void @@ -9544,6 +12618,48 @@ define void @v_permlane16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %src ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane16_v3i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: 
v_permlane16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane16_v3i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <3 x i64> @llvm.amdgcn.permlane16.v3i64(<3 x i64> %src0, <3 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <3 x i64> %v, ptr addrspace(1) %out ret void @@ -9669,6 +12785,52 @@ define void @v_permlane16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 % ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane16_v4f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; 
GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane16_v4f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <4 x double> @llvm.amdgcn.permlane16.v4f64(<4 x double> %src0, <4 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <4 x double> %v, ptr addrspace(1) %out ret void @@ -9854,6 +13016,72 @@ define void 
@v_permlane16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 % ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane16_v8f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v17, v17, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v16, v16, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v15, v15, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v14, v14, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v13, v13, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v12, v12, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v11, v11, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v10, v10, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x3 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane16_v8f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: 
s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v10, v10, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v11, v11, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v12, v12, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v13, v13, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v14, v14, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v15, v15, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v16, v16, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v17, v17, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x3 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <8 x double> @llvm.amdgcn.permlane16.v8f64(<8 x double> %src0, <8 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <8 x double> %v, ptr addrspace(1) %out ret void @@ -9945,6 +13173,40 @@ define void @v_permlanex16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %sr ; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlanex16_v2i64: +; 
GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlanex16_v2i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <2 x i64> @llvm.amdgcn.permlanex16.v2i64(<2 x i64> %src0, <2 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <2 x i64> %v, ptr addrspace(1) %out ret void @@ -10058,6 +13320,48 @@ define void @v_permlanex16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %sr ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlanex16_v3i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlanex16_v3i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <3 x i64> @llvm.amdgcn.permlanex16.v3i64(<3 x i64> %src0, <3 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <3 x i64> %v, ptr addrspace(1) %out ret void @@ -10183,6 +13487,52 @@ define void 
@v_permlanex16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlanex16_v4f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlanex16_v4f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: 
v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <4 x double> @llvm.amdgcn.permlanex16.v4f64(<4 x double> %src0, <4 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <4 x double> %v, ptr addrspace(1) %out ret void @@ -10368,6 +13718,72 @@ define void @v_permlanex16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlanex16_v8f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v17, v17, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v16, v16, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v15, v15, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v14, v14, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v13, v13, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v12, v12, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v11, v11, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v10, v10, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; 
GFX13-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x3 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlanex16_v8f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v10, v10, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v11, v11, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v12, v12, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v13, v13, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v14, v14, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v15, v15, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v16, v16, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v17, v17, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x3 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; 
GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <8 x double> @llvm.amdgcn.permlanex16.v8f64(<8 x double> %src0, <8 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <8 x double> %v, ptr addrspace(1) %out ret void @@ -10430,6 +13846,25 @@ define amdgpu_kernel void @v_permlanex16_convergent(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: .LBB142_2: ; %f ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_convergent: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mov_b32_e32 v1, s0 +; GFX13-NEXT: s_mov_b32 s0, exec_lo +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v1, v1, s1, s2 +; GFX13-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX13-NEXT: s_cbranch_execz .LBB142_2 +; GFX13-NEXT: ; %bb.1: ; %t +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; GFX13-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-NEXT: .LBB142_2: ; %f +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %pattern_lo, i32 %pattern_hi, i1 false, i1 false) %select = icmp eq i32 %tidx, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.up.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.up.ll new file mode 100644 index 0000000000000..0290764b9fe00 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.up.ll @@ -0,0 +1,1105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | 
FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-GISEL %s + +define i32 @v_permlane_up_b32_vss(i32 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_up_b32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_b32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2) + ret i32 %v +} + +define i32 @v_permlane_up_b32_vii(i32 %src0) { +; GFX1250-LABEL: v_permlane_up_b32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_b32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 1, i32 2) + ret 
i32 %v +} + +define i32 @v_permlane_up_b32_vll(i32 %src0) { +; GFX1250-LABEL: v_permlane_up_b32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_b32_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 100, i32 102) + ret i32 %v +} + +define i32 @v_permlane_up_b32_vvv(i32 %src0) { +; GFX1250-LABEL: v_permlane_up_b32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_b32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %tidx, i32 %tidy) + ret i32 %v +} + +define float @v_permlane_up_f32_vss(float %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_up_f32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.up(float %src0, i32 %src1, i32 %src2) + ret float %v +} + +define float @v_permlane_up_f32_vii(float %src0) { +; GFX1250-LABEL: v_permlane_up_f32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.up(float %src0, i32 1, i32 2) + ret float %v +} + +define float @v_permlane_up_f32_vll(float %src0) { +; 
GFX1250-LABEL: v_permlane_up_f32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f32_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.up(float %src0, i32 100, i32 102) + ret float %v +} + +define float @v_permlane_up_f32_vvv(float %src0) { +; GFX1250-LABEL: v_permlane_up_f32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; 
GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call float @llvm.amdgcn.permlane.up(float %src0, i32 %tidx, i32 %tidy) + ret float %v +} + +define i64 @v_permlane_up_i64_vss(i64 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_up_i64_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_i64_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.up(i64 %src0, i32 %src1, i32 %src2) + ret i64 %v +} + +define i64 @v_permlane_up_i64_vii(i64 %src0) { +; GFX1250-LABEL: v_permlane_up_i64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_i64_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call 
i64 @llvm.amdgcn.permlane.up(i64 %src0, i32 1, i32 2) + ret i64 %v +} + +define i64 @v_permlane_up_i64_vll(i64 %src0) { +; GFX1250-LABEL: v_permlane_up_i64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_i64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.up(i64 %src0, i32 100, i32 102) + ret i64 %v +} + +define i64 @v_permlane_up_i64_vvv(i64 %src0) { +; GFX1250-LABEL: v_permlane_up_i64_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_i64_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; 
GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlane.up(i64 %src0, i32 %tidx, i32 %tidy) + ret i64 %v +} + +define double @v_permlane_up_f64_vss(double %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_up_f64_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f64_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.up(double %src0, i32 %src1, i32 %src2) + ret double %v +} + +define double @v_permlane_up_f64_vii(double %src0) { +; GFX1250-LABEL: v_permlane_up_f64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f64_vii: +; 
GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.up(double %src0, i32 1, i32 2) + ret double %v +} + +define double @v_permlane_up_f64_vll(double %src0) { +; GFX1250-LABEL: v_permlane_up_f64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.up(double %src0, i32 100, i32 102) + ret double %v +} + +define double @v_permlane_up_f64_vvv(double %src0) { +; GFX1250-LABEL: v_permlane_up_f64_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; 
GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f64_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlane.up(double %src0, i32 %tidx, i32 %tidy) + ret double %v +} + +; does not work for GISEL +;define void @v_permlane_up_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1, i32 %src2) { +; %v = call bfloat @llvm.amdgcn.permlane.up.bf16(bfloat %src, i32 %src1, i32 %src2) +; store bfloat %v, ptr addrspace(1) %out, align 4 +; ret void +;} + +define void @v_permlane_up_i16(ptr addrspace(1) %out, i16 %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_up_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b16 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; 
GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i16 @llvm.amdgcn.permlane.up.i16(i16 %src, i32 %src1, i32 %src2) + store i16 %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_up_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_up_v2f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_v2f16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b32 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlane.up.v2f16(<2 x half> %src, i32 %src1, i32 %src2) + store <2 x half> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_up_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_up_v2f32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v2, v2, 
s0, s1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_up_v2f32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_up_v2f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_up_v2f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlane.up.v2f32(<2 x float> %src, i32 %src1, i32 %src2) + store 
<2 x float> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_up_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_up_v7i32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_up_v7i32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; 
GFX13-SDAG-LABEL: v_permlane_up_v7i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_up_v7i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call 
<7 x i32> @llvm.amdgcn.permlane.up.v7i32(<7 x i32> %src, i32 %src1, i32 %src2) + store <7 x i32> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_up_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_up_v8i16: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_up_v8i16: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_up_v8i16: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; 
GFX13-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_up_v8i16: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlane.up.v8i16(<8 x i16> %src, i32 %src1, i32 %src2) + store <8 x i16> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_up_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_up_v2i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_up_v2i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_up_v2i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_up_v2i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: 
s_set_pc_i64 s[30:31] + %v = call <2 x i64> @llvm.amdgcn.permlane.up.v2i64(<2 x i64> %src, i32 %src1, i32 %src2) + store <2 x i64> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_up_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_up_v3i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_up_v3i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; 
GFX13-SDAG-LABEL: v_permlane_up_v3i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_up_v3i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <3 x i64> @llvm.amdgcn.permlane.up.v3i64(<3 x i64> %src, i32 %src1, i32 %src2) + store <3 x i64> %v, ptr 
addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_up_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_up_v4f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_up_v4f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: 
global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_up_v4f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_up_v4f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; 
GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <4 x double> @llvm.amdgcn.permlane.up.v4f64(<4 x double> %src, i32 %src1, i32 %src2) + store <4 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_up_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_up_v8f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v17, v17, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v16, v16, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v15, v15, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v14, v14, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v13, v13, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v12, v12, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v11, v11, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v10, v10, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x3 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; 
GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_up_v8f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v10, v10, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v11, v11, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v12, v12, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v13, v13, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v14, v14, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v15, v15, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v16, v16, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v17, v17, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x3 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_up_v8f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; 
GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_up_b32 v17, v17, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v16, v16, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v15, v15, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v14, v14, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v13, v13, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v12, v12, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v11, v11, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v10, v10, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x3 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_up_v8f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v7, v7, 
s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v10, v10, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v11, v11, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v12, v12, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v13, v13, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v14, v14, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v15, v15, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v16, v16, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v17, v17, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x3 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x double> @llvm.amdgcn.permlane.up.v8f64(<8 x double> %src, i32 %src1, i32 %src2) + store <8 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.xor.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.xor.ll new file mode 100644 index 0000000000000..476f2894c29b2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.xor.ll @@ -0,0 +1,1105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | 
FileCheck -check-prefixes=GFX13,GFX13-GISEL %s + +define i32 @v_permlane_xor_b32_vss(i32 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_xor_b32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_b32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2) + ret i32 %v +} + +define i32 @v_permlane_xor_b32_vii(i32 %src0) { +; GFX1250-LABEL: v_permlane_xor_b32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_b32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 1, i32 2) + ret i32 %v +} + +define i32 @v_permlane_xor_b32_vll(i32 %src0) { +; GFX1250-LABEL: v_permlane_xor_b32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_b32_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 100, i32 102) + ret i32 %v +} + +define i32 @v_permlane_xor_b32_vvv(i32 %src0) { +; GFX1250-LABEL: v_permlane_xor_b32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_b32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %tidx, i32 %tidy) + ret i32 %v +} + +define 
float @v_permlane_xor_f32_vss(float %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_xor_f32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.xor(float %src0, i32 %src1, i32 %src2) + ret float %v +} + +define float @v_permlane_xor_f32_vii(float %src0) { +; GFX1250-LABEL: v_permlane_xor_f32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.xor(float %src0, i32 1, i32 2) + ret float %v +} + +define float @v_permlane_xor_f32_vll(float %src0) { +; GFX1250-LABEL: v_permlane_xor_f32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, 
s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f32_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.xor(float %src0, i32 100, i32 102) + ret float %v +} + +define float @v_permlane_xor_f32_vvv(float %src0) { +; GFX1250-LABEL: v_permlane_xor_f32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call float @llvm.amdgcn.permlane.xor(float %src0, i32 %tidx, i32 %tidy) + ret float %v +} + +define i64 
@v_permlane_xor_i64_vss(i64 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_xor_i64_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_i64_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.xor(i64 %src0, i32 %src1, i32 %src2) + ret i64 %v +} + +define i64 @v_permlane_xor_i64_vii(i64 %src0) { +; GFX1250-LABEL: v_permlane_xor_i64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_i64_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.xor(i64 %src0, i32 1, i32 2) + ret i64 %v +} + +define i64 @v_permlane_xor_i64_vll(i64 %src0) { +; GFX1250-LABEL: v_permlane_xor_i64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 
0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_i64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.xor(i64 %src0, i32 100, i32 102) + ret i64 %v +} + +define i64 @v_permlane_xor_i64_vvv(i64 %src0) { +; GFX1250-LABEL: v_permlane_xor_i64_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_i64_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) 
+; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlane.xor(i64 %src0, i32 %tidx, i32 %tidy) + ret i64 %v +} + +define double @v_permlane_xor_f64_vss(double %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_xor_f64_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f64_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.xor(double %src0, i32 %src1, i32 %src2) + ret double %v +} + +define double @v_permlane_xor_f64_vii(double %src0) { +; GFX1250-LABEL: v_permlane_xor_f64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f64_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; 
GFX13-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.xor(double %src0, i32 1, i32 2) + ret double %v +} + +define double @v_permlane_xor_f64_vll(double %src0) { +; GFX1250-LABEL: v_permlane_xor_f64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.xor(double %src0, i32 100, i32 102) + ret double %v +} + +define double @v_permlane_xor_f64_vvv(double %src0) { +; GFX1250-LABEL: v_permlane_xor_f64_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f64_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; 
GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlane.xor(double %src0, i32 %tidx, i32 %tidy) + ret double %v +} + +; does not work for GISEL +;define void @v_permlane_xor_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1, i32 %src2) { +; %v = call bfloat @llvm.amdgcn.permlane.xor.bf16(bfloat %src, i32 %src1, i32 %src2) +; store bfloat %v, ptr addrspace(1) %out, align 4 +; ret void +;} + +define void @v_permlane_xor_i16(ptr addrspace(1) %out, i16 %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_xor_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b16 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: 
s_set_pc_i64 s[30:31] + %v = call i16 @llvm.amdgcn.permlane.xor.i16(i16 %src, i32 %src1, i32 %src2) + store i16 %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_xor_v2f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_v2f16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b32 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlane.xor.v2f16(<2 x half> %src, i32 %src1, i32 %src2) + store <2 x half> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_xor_v2f32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: 
v_permlane_xor_v2f32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_xor_v2f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_xor_v2f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlane.xor.v2f32(<2 x float> %src, i32 %src1, i32 %src2) + store <2 x float> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v7i32(ptr addrspace(1) %out, <7 x 
i32> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_xor_v7i32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_xor_v7i32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_xor_v7i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_xor_v7i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlane.xor.v7i32(<7 x i32> %src, i32 %src1, i32 %src2) + store <7 x 
i32> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_xor_v8i16: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_xor_v8i16: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_xor_v8i16: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: 
v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_xor_v8i16: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlane.xor.v8i16(<8 x i16> %src, i32 %src1, i32 %src2) + store <8 x i16> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_xor_v2i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_xor_v2i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_xor_v2i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_xor_v2i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x i64> 
@llvm.amdgcn.permlane.xor.v2i64(<2 x i64> %src, i32 %src1, i32 %src2) + store <2 x i64> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_xor_v3i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_xor_v3i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: 
v_permlane_xor_v3i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_xor_v3i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <3 x i64> @llvm.amdgcn.permlane.xor.v3i64(<3 x i64> %src, i32 %src1, i32 %src2) + store <3 x i64> %v, ptr 
addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_xor_v4f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_xor_v4f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v9, v9, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; 
GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_xor_v4f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_xor_v4f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: 
v_permlane_xor_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <4 x double> @llvm.amdgcn.permlane.xor.v4f64(<4 x double> %src, i32 %src1, i32 %src2) + store <4 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_xor_v8f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v17, v17, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v16, v16, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v15, v15, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v14, v14, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v13, v13, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v12, v12, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v11, v11, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v10, v10, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x3 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; 
GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_xor_v8f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v9, v9, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v10, v10, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v11, v11, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v12, v12, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v13, v13, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v14, v14, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v15, v15, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v16, v16, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v17, v17, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x3 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_xor_v8f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: 
v_readfirstlane_b32 s0, v18 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v17, v17, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v16, v16, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v15, v15, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v14, v14, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v13, v13, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v12, v12, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v11, v11, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v10, v10, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x3 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_xor_v8f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; 
GFX13-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v10, v10, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v11, v11, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v12, v12, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v13, v13, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v14, v14, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v15, v15, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v16, v16, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v17, v17, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x3 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x double> @llvm.amdgcn.permlane.xor.v8f64(<8 x double> %src, i32 %src1, i32 %src2) + store <8 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll b/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll index 20b450d383df4..4adc317a66c8d 100644 --- a/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll +++ b/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll @@ -48,7 +48,7 @@ define i32 @v_known_signbits_smed3(i16 %a, i16 %b) { ; SI-SDAG-NEXT: s_movk_i32 s4, 0xffe0 ; SI-SDAG-NEXT: v_med3_i32 v0, v0, s4, 64 ; SI-SDAG-NEXT: v_cvt_f32_i32_e32 v3, v0 -; SI-SDAG-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; SI-SDAG-NEXT: v_rcp_f32_e32 v4, v2 ; SI-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; SI-SDAG-NEXT: v_or_b32_e32 v0, 1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 03163c7b5ed15..57981d1763959 100644 --- 
a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -64,7 +64,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) ; GFX9-NEXT: v_add_u32_e32 v4, v4, v0 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v6 ; GFX9-NEXT: v_lshl_add_u32 v6, v4, 2, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GFX9-NEXT: v_rcp_f32_e32 v7, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v2 ; GFX9-NEXT: v_add_u32_e32 v9, v17, v12 ; GFX9-NEXT: s_mov_b64 s[10:11], 0 diff --git a/llvm/test/CodeGen/AMDGPU/optimize-ds-bvh-stack-pre-ra.ll b/llvm/test/CodeGen/AMDGPU/optimize-ds-bvh-stack-pre-ra.ll new file mode 100644 index 0000000000000..be351ea026a03 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/optimize-ds-bvh-stack-pre-ra.ll @@ -0,0 +1,300 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define amdgpu_gs void @test_ds_bvh_stack_push4_pop1(i32 %addr, i32 %data.0, i64 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, i32 %offset, <4 x i32> inreg %tdescr, ptr addrspace(1) %p.0, ptr addrspace(1) %p.1, ptr addrspace(1) %p.2, ptr addrspace(1) %p.3) { +; CHECK-LABEL: test_ds_bvh_stack_push4_pop1: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v21, v6 +; CHECK-NEXT: v_dual_mov_b32 v20, v5 :: v_dual_mov_b32 v5, 0 +; CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; CHECK-NEXT: v_dual_mov_b32 v33, v22 :: v_dual_mov_b32 v32, v21 +; CHECK-NEXT: v_mov_b32_e32 v31, v20 +; CHECK-NEXT: image_bvh8_intersect_ray v[21:30], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3] +; CHECK-NEXT: v_cmpx_eq_f32_e32 0, v20 +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %if +; CHECK-NEXT: global_load_b64 v[6:7], 
v[12:13], off +; CHECK-NEXT: global_load_b64 v[34:35], v[14:15], off +; CHECK-NEXT: global_load_b64 v[36:37], v[16:17], off +; CHECK-NEXT: global_load_b64 v[38:39], v[18:19], off +; CHECK-NEXT: s_wait_loadcnt 0x2 +; CHECK-NEXT: v_add_nc_u32_e32 v1, v7, v35 +; CHECK-NEXT: s_wait_loadcnt 0x1 +; CHECK-NEXT: v_add3_u32 v6, v6, v34, v36 +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_add3_u32 v1, v1, v37, v39 +; CHECK-NEXT: v_add3_u32 v1, v6, v38, v1 +; CHECK-NEXT: .LBB0_2: ; %end +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[21:24] +; CHECK-NEXT: image_bvh8_intersect_ray v[20:29], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3] +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: global_store_b32 v[12:13], v1, off +; CHECK-NEXT: global_store_b32 v[14:15], v0, off +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: global_store_b32 v[16:17], v20, off +; CHECK-NEXT: global_store_b32 v[18:19], v21, off +; CHECK-NEXT: s_endpgm +entry: + %ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %v = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %ray_origin, <3 x float> %ray_dir, i32 %offset, <4 x i32> %tdescr) + %a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 0 + %val.0 = extractelement <10 x i32> %a, i32 0 + %val.1 = extractelement <10 x i32> %a, i32 1 + %val.2 = extractelement <10 x i32> %a, i32 2 + 
%val.3 = extractelement <10 x i32> %a, i32 3 + %bvh.0 = insertelement <4 x i32> poison, i32 %val.0, i32 0 + %bvh.1 = insertelement <4 x i32> %bvh.0, i32 %val.1, i32 1 + %bvh.2 = insertelement <4 x i32> %bvh.1, i32 %val.2, i32 2 + %bvh = insertelement <4 x i32> %bvh.2, i32 %val.3, i32 3 + %cnd = fcmp oeq float %ray_origin_x, 0.0 + br i1 %cnd, label %if, label %end + +if: + ; loads to force vgpr pressure + %load.0 = load <2 x i32>, ptr addrspace(1) %p.0 + %load.1 = load <2 x i32>, ptr addrspace(1) %p.1 + %load.2 = load <2 x i32>, ptr addrspace(1) %p.2 + %load.3 = load <2 x i32>, ptr addrspace(1) %p.3 + %add.0 = add <2 x i32> %load.0, %load.1 + %add.1 = add <2 x i32> %add.0, %load.2 + %add.2 = add <2 x i32> %add.1, %load.3 + %.i0 = extractelement <2 x i32> %add.2, i32 0 + %.i1 = extractelement <2 x i32> %add.2, i32 1 + %data.1 = add i32 %.i0, %.i1 + br label %end + +end: + %data = phi i32 [ %data.0, %entry ], [ %data.1, %if ] + %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data, <4 x i32> %bvh, i32 0) + %vdst = extractvalue { i32, i32 } %pair, 0 + %newaddr = extractvalue { i32, i32 } %pair, 1 + + ; keep all intersect ray parameters live + %new.origin = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1 + %new.dir = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2 + %v.2 = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %new.origin, <3 x float> %new.dir, i32 %offset, <4 x i32> %tdescr) + %b = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v.2, 0 + %c = extractelement <10 x i32> %b, i32 0 + %d = extractelement <10 x i32> %b, i32 1 + + ; stores keep pointers live + store i32 %vdst, ptr addrspace(1) %p.0 + store i32 %newaddr, ptr addrspace(1) %p.1 + store i32 %c, ptr addrspace(1) %p.2 + store i32 %d, ptr addrspace(1) %p.3 + + ret void +} + +define amdgpu_gs void @test_ds_bvh_stack_push8_pop1(i32 %addr, i32 %data.0, i64 %node_ptr, 
float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, i32 %offset, <4 x i32> inreg %tdescr, ptr addrspace(1) %p.0, ptr addrspace(1) %p.1, ptr addrspace(1) %p.2, ptr addrspace(1) %p.3) { +; CHECK-LABEL: test_ds_bvh_stack_push8_pop1: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v21, v6 +; CHECK-NEXT: v_dual_mov_b32 v20, v5 :: v_dual_mov_b32 v5, 0 +; CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; CHECK-NEXT: v_dual_mov_b32 v33, v22 :: v_dual_mov_b32 v32, v21 +; CHECK-NEXT: v_mov_b32_e32 v31, v20 +; CHECK-NEXT: image_bvh8_intersect_ray v[21:30], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3] +; CHECK-NEXT: v_cmpx_eq_f32_e32 0, v20 +; CHECK-NEXT: s_cbranch_execz .LBB1_2 +; CHECK-NEXT: ; %bb.1: ; %if +; CHECK-NEXT: global_load_b64 v[6:7], v[12:13], off +; CHECK-NEXT: global_load_b64 v[34:35], v[14:15], off +; CHECK-NEXT: global_load_b64 v[36:37], v[16:17], off +; CHECK-NEXT: global_load_b64 v[38:39], v[18:19], off +; CHECK-NEXT: s_wait_loadcnt 0x2 +; CHECK-NEXT: v_add_nc_u32_e32 v1, v7, v35 +; CHECK-NEXT: s_wait_loadcnt 0x1 +; CHECK-NEXT: v_add3_u32 v6, v6, v34, v36 +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_add3_u32 v1, v1, v37, v39 +; CHECK-NEXT: v_add3_u32 v1, v6, v38, v1 +; CHECK-NEXT: .LBB1_2: ; %end +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[21:28] +; CHECK-NEXT: image_bvh8_intersect_ray v[20:29], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3] +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: global_store_b32 v[12:13], v1, off +; CHECK-NEXT: global_store_b32 v[14:15], v0, off +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: global_store_b32 v[16:17], v20, off +; CHECK-NEXT: 
global_store_b32 v[18:19], v21, off +; CHECK-NEXT: s_endpgm +entry: + %ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %v = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %ray_origin, <3 x float> %ray_dir, i32 %offset, <4 x i32> %tdescr) + %a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 0 + %val.0 = extractelement <10 x i32> %a, i32 0 + %val.1 = extractelement <10 x i32> %a, i32 1 + %val.2 = extractelement <10 x i32> %a, i32 2 + %val.3 = extractelement <10 x i32> %a, i32 3 + %val.4 = extractelement <10 x i32> %a, i32 4 + %val.5 = extractelement <10 x i32> %a, i32 5 + %val.6 = extractelement <10 x i32> %a, i32 6 + %val.7 = extractelement <10 x i32> %a, i32 7 + %bvh.0 = insertelement <8 x i32> poison, i32 %val.0, i32 0 + %bvh.1 = insertelement <8 x i32> %bvh.0, i32 %val.1, i32 1 + %bvh.2 = insertelement <8 x i32> %bvh.1, i32 %val.2, i32 2 + %bvh.3 = insertelement <8 x i32> %bvh.2, i32 %val.3, i32 3 + %bvh.4 = insertelement <8 x i32> %bvh.3, i32 %val.4, i32 4 + %bvh.5 = insertelement <8 x i32> %bvh.4, i32 %val.5, i32 5 + %bvh.6 = insertelement <8 x i32> %bvh.5, i32 %val.6, i32 6 + %bvh = insertelement <8 x i32> %bvh.6, i32 %val.7, i32 7 + %cnd = fcmp oeq float %ray_origin_x, 0.0 + br i1 %cnd, label %if, label %end + +if: + ; loads to force vgpr pressure + %load.0 = load <2 x i32>, ptr addrspace(1) %p.0 + %load.1 = load <2 x i32>, ptr addrspace(1) %p.1 + %load.2 = load <2 x i32>, ptr addrspace(1) %p.2 + %load.3 = load <2 x i32>, ptr addrspace(1) %p.3 + %add.0 = 
add <2 x i32> %load.0, %load.1 + %add.1 = add <2 x i32> %add.0, %load.2 + %add.2 = add <2 x i32> %add.1, %load.3 + %.i0 = extractelement <2 x i32> %add.2, i32 0 + %.i1 = extractelement <2 x i32> %add.2, i32 1 + %data.1 = add i32 %.i0, %.i1 + br label %end + +end: + %data = phi i32 [ %data.0, %entry ], [ %data.1, %if ] + %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data, <8 x i32> %bvh, i32 0) + %vdst = extractvalue { i32, i32 } %pair, 0 + %newaddr = extractvalue { i32, i32 } %pair, 1 + + ; keep all intersect ray parameters live + %new.origin = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1 + %new.dir = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2 + %v.2 = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %new.origin, <3 x float> %new.dir, i32 %offset, <4 x i32> %tdescr) + %b = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v.2, 0 + %c = extractelement <10 x i32> %b, i32 0 + %d = extractelement <10 x i32> %b, i32 1 + + ; stores keep pointers live + store i32 %vdst, ptr addrspace(1) %p.0 + store i32 %newaddr, ptr addrspace(1) %p.1 + store i32 %c, ptr addrspace(1) %p.2 + store i32 %d, ptr addrspace(1) %p.3 + + ret void +} + +define amdgpu_gs void @test_ds_bvh_stack_push8_pop2(i32 %addr, i32 %data.0, i64 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, i32 %offset, <4 x i32> inreg %tdescr, ptr addrspace(1) %p.0, ptr addrspace(1) %p.1, ptr addrspace(1) %p.2, ptr addrspace(1) %p.3) { +; CHECK-LABEL: test_ds_bvh_stack_push8_pop2: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v21, v6 +; CHECK-NEXT: v_dual_mov_b32 v20, v5 :: v_dual_mov_b32 v5, 0 +; CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; CHECK-NEXT: 
v_dual_mov_b32 v33, v22 :: v_dual_mov_b32 v32, v21 +; CHECK-NEXT: v_mov_b32_e32 v31, v20 +; CHECK-NEXT: image_bvh8_intersect_ray v[21:30], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3] +; CHECK-NEXT: v_cmpx_eq_f32_e32 0, v20 +; CHECK-NEXT: s_cbranch_execz .LBB2_2 +; CHECK-NEXT: ; %bb.1: ; %if +; CHECK-NEXT: global_load_b64 v[6:7], v[12:13], off +; CHECK-NEXT: global_load_b64 v[34:35], v[14:15], off +; CHECK-NEXT: global_load_b64 v[36:37], v[16:17], off +; CHECK-NEXT: global_load_b64 v[38:39], v[18:19], off +; CHECK-NEXT: s_wait_loadcnt 0x2 +; CHECK-NEXT: v_add_nc_u32_e32 v1, v7, v35 +; CHECK-NEXT: s_wait_loadcnt 0x1 +; CHECK-NEXT: v_add3_u32 v6, v6, v34, v36 +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_add3_u32 v1, v1, v37, v39 +; CHECK-NEXT: v_add3_u32 v1, v6, v38, v1 +; CHECK-NEXT: .LBB2_2: ; %end +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: ds_bvh_stack_push8_pop2_rtn_b64 v[6:7], v0, v1, v[21:28] +; CHECK-NEXT: image_bvh8_intersect_ray v[20:29], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3] +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: global_store_b64 v[12:13], v[6:7], off +; CHECK-NEXT: global_store_b32 v[14:15], v0, off +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: global_store_b32 v[16:17], v20, off +; CHECK-NEXT: global_store_b32 v[18:19], v21, off +; CHECK-NEXT: s_endpgm +entry: + %ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %v = call {<10 x i32>, <3 x float>, <3 x float>} 
@llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %ray_origin, <3 x float> %ray_dir, i32 %offset, <4 x i32> %tdescr) + %a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 0 + %val.0 = extractelement <10 x i32> %a, i32 0 + %val.1 = extractelement <10 x i32> %a, i32 1 + %val.2 = extractelement <10 x i32> %a, i32 2 + %val.3 = extractelement <10 x i32> %a, i32 3 + %val.4 = extractelement <10 x i32> %a, i32 4 + %val.5 = extractelement <10 x i32> %a, i32 5 + %val.6 = extractelement <10 x i32> %a, i32 6 + %val.7 = extractelement <10 x i32> %a, i32 7 + %bvh.0 = insertelement <8 x i32> poison, i32 %val.0, i32 0 + %bvh.1 = insertelement <8 x i32> %bvh.0, i32 %val.1, i32 1 + %bvh.2 = insertelement <8 x i32> %bvh.1, i32 %val.2, i32 2 + %bvh.3 = insertelement <8 x i32> %bvh.2, i32 %val.3, i32 3 + %bvh.4 = insertelement <8 x i32> %bvh.3, i32 %val.4, i32 4 + %bvh.5 = insertelement <8 x i32> %bvh.4, i32 %val.5, i32 5 + %bvh.6 = insertelement <8 x i32> %bvh.5, i32 %val.6, i32 6 + %bvh = insertelement <8 x i32> %bvh.6, i32 %val.7, i32 7 + %cnd = fcmp oeq float %ray_origin_x, 0.0 + br i1 %cnd, label %if, label %end + +if: + ; loads to force vgpr pressure + %load.0 = load <2 x i32>, ptr addrspace(1) %p.0 + %load.1 = load <2 x i32>, ptr addrspace(1) %p.1 + %load.2 = load <2 x i32>, ptr addrspace(1) %p.2 + %load.3 = load <2 x i32>, ptr addrspace(1) %p.3 + %add.0 = add <2 x i32> %load.0, %load.1 + %add.1 = add <2 x i32> %add.0, %load.2 + %add.2 = add <2 x i32> %add.1, %load.3 + %.i0 = extractelement <2 x i32> %add.2, i32 0 + %.i1 = extractelement <2 x i32> %add.2, i32 1 + %data.1 = add i32 %.i0, %.i1 + br label %end + +end: + %data = phi i32 [ %data.0, %entry ], [ %data.1, %if ] + %pair = call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32 %addr, i32 %data, <8 x i32> %bvh, i32 0) + %vdst = extractvalue { i64, i32 } %pair, 0 + %newaddr = extractvalue { i64, i32 } %pair, 1 + + ; keep all intersect ray parameters live + %new.origin 
= extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1 + %new.dir = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2 + %v.2 = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %new.origin, <3 x float> %new.dir, i32 %offset, <4 x i32> %tdescr) + %b = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v.2, 0 + %c = extractelement <10 x i32> %b, i32 0 + %d = extractelement <10 x i32> %b, i32 1 + + ; stores keep pointers live + store i64 %vdst, ptr addrspace(1) %p.0 + store i32 %newaddr, ptr addrspace(1) %p.1 + store i32 %c, ptr addrspace(1) %p.2 + store i32 %d, ptr addrspace(1) %p.3 + + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index d21cda572f5f4..b54d80a03abd0 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -1503,12 +1503,12 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v10 +; GFX10-NEXT: v_rcp_f32_e32 v15, v1 +; GFX10-NEXT: v_rcp_f32_e32 v16, v10 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12 +; GFX10-NEXT: v_rcp_f32_e32 v17, v12 ; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v14 +; GFX10-NEXT: v_rcp_f32_e32 v18, v14 ; GFX10-NEXT: v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 ; 
GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 @@ -1581,10 +1581,10 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; GFX9-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v12 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v13 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v4 +; GFX9-NEXT: v_rcp_f32_e32 v15, v2 +; GFX9-NEXT: v_rcp_f32_e32 v16, v12 +; GFX9-NEXT: v_rcp_f32_e32 v17, v13 +; GFX9-NEXT: v_rcp_f32_e32 v18, v4 ; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15 ; GFX9-NEXT: v_mul_f32_e32 v16, v11, v16 ; GFX9-NEXT: v_trunc_f32_e32 v15, v15 @@ -1856,11 +1856,11 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v13 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v3 +; GFX10-NEXT: v_rcp_f32_e32 v17, v2 +; GFX10-NEXT: v_rcp_f32_e32 v18, v13 +; GFX10-NEXT: v_rcp_f32_e32 v19, v3 ; GFX10-NEXT: v_xor_b32_sdwa v1, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v20, v15 +; GFX10-NEXT: v_rcp_f32_e32 v20, v15 ; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 ; 
GFX10-NEXT: v_cvt_f32_i32_sdwa v21, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; GFX10-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 @@ -1938,7 +1938,7 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v3 +; GFX9-NEXT: v_rcp_f32_e32 v17, v3 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 ; GFX9-NEXT: v_xor_b32_sdwa v15, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 ; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4 @@ -1946,8 +1946,8 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_trunc_f32_e32 v17, v17 ; GFX9-NEXT: v_mad_f32 v19, -v17, v3, v10 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v3| -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v14 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v19, v10 +; GFX9-NEXT: v_rcp_f32_e32 v3, v14 +; GFX9-NEXT: v_rcp_f32_e32 v19, v10 ; GFX9-NEXT: v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 ; GFX9-NEXT: v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 ; GFX9-NEXT: v_mul_f32_e32 v3, v13, v3 @@ -1960,7 +1960,7 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_mad_f32 v19, -v15, v10, v16 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v19|, |v10| -; GFX9-NEXT: v_rcp_iflag_f32_e32 v10, v16 +; GFX9-NEXT: v_rcp_f32_e32 v10, v16 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2 ; GFX9-NEXT: v_xor_b32_sdwa v19, 
sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17 @@ -2221,11 +2221,11 @@ define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v14, v0 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9 +; GFX10-NEXT: v_rcp_f32_e32 v10, v1 +; GFX10-NEXT: v_rcp_f32_e32 v11, v3 +; GFX10-NEXT: v_rcp_f32_e32 v13, v9 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v15, v0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4 +; GFX10-NEXT: v_rcp_f32_e32 v12, v4 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x40207 ; GFX10-NEXT: v_mul_f32_e32 v10, v14, v10 ; GFX10-NEXT: v_mul_f32_e32 v11, v4, v11 @@ -2274,18 +2274,18 @@ define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ; GFX9-NEXT: s_mov_b32 s4, 0x40207 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v11, v2 +; GFX9-NEXT: v_rcp_f32_e32 v11, v2 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v3 +; GFX9-NEXT: v_rcp_f32_e32 v12, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v1, v9 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v10, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v10 +; GFX9-NEXT: v_rcp_f32_e32 v13, v10 ; GFX9-NEXT: v_mul_f32_e32 v11, v1, v11 ; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v4, v4 ; GFX9-NEXT: v_trunc_f32_e32 v11, v11 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v4 +; GFX9-NEXT: v_rcp_f32_e32 v14, v4 ; GFX9-NEXT: v_mul_f32_e32 v12, v10, v12 ; GFX9-NEXT: v_mad_f32 v1, -v11, v2, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v11 @@ -2409,10 +2409,10 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v9, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
v_cvt_f32_ubyte2_e32 v15, v0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9 +; GFX10-NEXT: v_rcp_f32_e32 v10, v1 +; GFX10-NEXT: v_rcp_f32_e32 v11, v3 +; GFX10-NEXT: v_rcp_f32_e32 v12, v4 +; GFX10-NEXT: v_rcp_f32_e32 v13, v9 ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 24, v2 @@ -2470,15 +2470,15 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: s_mov_b32 s4, 0x2050505 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2 +; GFX9-NEXT: v_rcp_f32_e32 v15, v2 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v3 +; GFX9-NEXT: v_rcp_f32_e32 v16, v3 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v11, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v11 +; GFX9-NEXT: v_rcp_f32_e32 v17, v11 ; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v14, v4 ; GFX9-NEXT: v_trunc_f32_e32 v15, v15 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v14 +; GFX9-NEXT: v_rcp_f32_e32 v18, v14 ; GFX9-NEXT: v_mul_f32_e32 v16, v3, v16 ; GFX9-NEXT: v_mad_f32 v19, -v15, v2, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 diff --git a/llvm/test/CodeGen/AMDGPU/pr155452.ll b/llvm/test/CodeGen/AMDGPU/pr155452.ll index 928997e9fecb8..b51b6ee07aea5 100644 --- a/llvm/test/CodeGen/AMDGPU/pr155452.ll +++ b/llvm/test/CodeGen/AMDGPU/pr155452.ll @@ -19,7 +19,7 @@ define amdgpu_kernel void @my_kernel(i64 %foo, i32 %bar) { ; CHECK-NEXT: s_abs_i32 s7, s0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s7 ; CHECK-NEXT: s_sub_i32 s0, 0, s7 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_rcp_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_lo_u32 v3, s0, v2 diff --git 
a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll index 56bb3ce1742b8..6efd25289deec 100644 --- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll +++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll @@ -762,26 +762,17 @@ define amdgpu_cs half @srcmods_neg_f16(half inreg %src) { ret half %result } -; TODO: SelectionDAG should avoid generating v_rcp_iflag_f32. define amdgpu_cs float @fdiv_f32_i32(float inreg %a, i32 inreg %b) { -; GFX12-SDAG-LABEL: fdiv_f32_i32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_cvt_f32_u32 s1, s1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX12-SDAG-NEXT: v_rcp_iflag_f32_e32 v0, s1 -; GFX12-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: fdiv_f32_i32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_cvt_f32_u32 s1, s1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX12-GISEL-NEXT: v_s_rcp_f32 s1, s1 -; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1 -; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: fdiv_f32_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_cvt_f32_u32 s1, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-NEXT: v_s_rcp_f32 s1, s1 +; GFX12-NEXT: s_mul_f32 s0, s0, s1 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog ; ; GCN-GISEL-LABEL: fdiv_f32_i32: ; GCN-GISEL: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll b/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll index db3c902ec2416..e941186541642 100644 --- a/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll +++ 
b/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll @@ -1,8 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN %s -; GCN-LABEL: {{^}}rcp_uint: -; GCN: v_rcp_iflag_f32_e32 define amdgpu_kernel void @rcp_uint(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { +; GCN-LABEL: rcp_uint: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %load = load i32, ptr addrspace(1) %in, align 4 %cvt = uitofp i32 %load to float %div = fdiv float 1.000000e+00, %cvt, !fpmath !0 @@ -10,9 +23,21 @@ define amdgpu_kernel void @rcp_uint(ptr addrspace(1) %in, ptr addrspace(1) %out) ret void } -; GCN-LABEL: {{^}}rcp_sint: -; GCN: v_rcp_iflag_f32_e32 define amdgpu_kernel void @rcp_sint(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { +; GCN-LABEL: rcp_sint: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %load = load i32, ptr addrspace(1) %in, align 4 %cvt = sitofp i32 %load to float %div = fdiv float 1.000000e+00, %cvt, !fpmath !0 @@ -20,9 +45,31 @@ define amdgpu_kernel void @rcp_sint(ptr addrspace(1) %in, ptr addrspace(1) %out) ret void } -; GCN-LABEL: {{^}}rcp_uint_denorm: -; GCN-NOT: v_rcp_iflag_f32 define 
amdgpu_kernel void @rcp_uint_denorm(ptr addrspace(1) %in, ptr addrspace(1) %out) #1 { +; GCN-LABEL: rcp_uint_denorm: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0 +; GCN-NEXT: v_rcp_f32_e32 v2, v1 +; GCN-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; GCN-NEXT: v_fma_f32 v2, v3, v2, v2 +; GCN-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GCN-NEXT: v_mul_f32_e32 v4, v3, v2 +; GCN-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GCN-NEXT: v_fma_f32 v4, v5, v2, v4 +; GCN-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GCN-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GCN-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %load = load i32, ptr addrspace(1) %in, align 4 %cvt = uitofp i32 %load to float %div = fdiv float 1.000000e+00, %cvt @@ -30,9 +77,31 @@ define amdgpu_kernel void @rcp_uint_denorm(ptr addrspace(1) %in, ptr addrspace(1 ret void } -; GCN-LABEL: {{^}}rcp_sint_denorm: -; GCN-NOT: v_rcp_iflag_f32 define amdgpu_kernel void @rcp_sint_denorm(ptr addrspace(1) %in, ptr addrspace(1) %out) #1 { +; GCN-LABEL: rcp_sint_denorm: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0 +; GCN-NEXT: v_rcp_f32_e32 v2, v1 +; GCN-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; GCN-NEXT: v_fma_f32 v2, v3, v2, v2 +; GCN-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GCN-NEXT: v_mul_f32_e32 v4, v3, v2 +; 
GCN-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GCN-NEXT: v_fma_f32 v4, v5, v2, v4 +; GCN-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GCN-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GCN-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %load = load i32, ptr addrspace(1) %in, align 4 %cvt = sitofp i32 %load to float %div = fdiv float 1.000000e+00, %cvt diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index 441509ba01f64..b690879dab99b 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -33,7 +33,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GCN-NEXT: v_rcp_f32_e32 v3, v3 ; GCN-NEXT: v_max_i32_e32 v5, v0, v5 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 @@ -77,7 +77,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v2 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 ; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; TONGA-NEXT: v_rcp_f32_e32 v3, v3 ; TONGA-NEXT: v_max_i32_e32 v5, v0, v5 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 31, v0 @@ -122,7 +122,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 ; GFX9-NEXT: s_sub_i32 s7, 0, s5 ; GFX9-NEXT: s_xor_b32 s4, s6, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: s_abs_i32 s6, s6 ; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 @@ -412,9 +412,9 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v6, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v9, v3 ; 
GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GCN-NEXT: v_rcp_f32_e32 v6, v6 ; GCN-NEXT: v_max_i32_e32 v0, v0, v5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v9 +; GCN-NEXT: v_rcp_f32_e32 v5, v9 ; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 ; GCN-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 @@ -483,9 +483,9 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cvt_f32_u32_e32 v6, v2 ; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v3 ; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; TONGA-NEXT: v_rcp_f32_e32 v6, v6 ; TONGA-NEXT: v_max_i32_e32 v0, v0, v5 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v9 +; TONGA-NEXT: v_rcp_f32_e32 v5, v9 ; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 ; TONGA-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; TONGA-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 @@ -549,7 +549,7 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 ; GFX9-NEXT: s_xor_b32 s0, s5, s0 ; GFX9-NEXT: s_ashr_i32 s6, s0, 31 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-NEXT: s_sub_i32 s0, 0, s1 ; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: v_readfirstlane_b32 s4, v3 @@ -575,7 +575,7 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_xor_b32 s5, s5, s6 ; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: s_sub_i32 s9, 0, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s5, s5, s6 ; GFX9-NEXT: s_mov_b32 s0, s8 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 @@ -812,13 +812,13 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v10, v4 ; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 ; GCN-NEXT: v_xor_b32_e32 v11, v1, v5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GCN-NEXT: v_rcp_f32_e32 v10, v10 ; GCN-NEXT: 
v_max_i32_e32 v5, v5, v13 ; GCN-NEXT: v_cvt_f32_u32_e32 v13, v5 ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v4 ; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 ; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GCN-NEXT: v_rcp_iflag_f32_e32 v13, v13 +; GCN-NEXT: v_rcp_f32_e32 v13, v13 ; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 ; GCN-NEXT: v_mul_lo_u32 v16, v16, v10 ; GCN-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13 @@ -840,7 +840,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; GCN-NEXT: v_mul_lo_u32 v13, v10, v4 ; GCN-NEXT: v_mul_hi_u32 v12, v1, v12 -; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v15 +; GCN-NEXT: v_rcp_f32_e32 v9, v15 ; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v14 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 ; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 @@ -866,7 +866,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v5 ; GCN-NEXT: v_mul_hi_u32 v4, v9, v4 ; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 ; GCN-NEXT: v_max_i32_e32 v2, v2, v9 @@ -939,13 +939,13 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v4 ; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5 ; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; TONGA-NEXT: v_rcp_f32_e32 v10, v10 ; TONGA-NEXT: v_max_i32_e32 v5, v5, v13 ; TONGA-NEXT: v_cvt_f32_u32_e32 v13, v5 ; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v4 ; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 ; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v13, v13 +; TONGA-NEXT: v_rcp_f32_e32 v13, v13 ; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1 ; TONGA-NEXT: v_mul_lo_u32 v16, v16, v10 ; TONGA-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13 @@ -967,7 +967,7 @@ 
define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_add_u32_e32 v12, vcc, v13, v12 ; TONGA-NEXT: v_mul_lo_u32 v13, v10, v4 ; TONGA-NEXT: v_mul_hi_u32 v12, v1, v12 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v15 +; TONGA-NEXT: v_rcp_f32_e32 v9, v15 ; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v14 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v13 ; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 @@ -993,7 +993,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5 ; TONGA-NEXT: v_mul_hi_u32 v4, v9, v4 ; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_rcp_f32_e32 v0, v0 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 ; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 ; TONGA-NEXT: v_max_i32_e32 v2, v2, v9 @@ -1063,7 +1063,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_readfirstlane_b32 s5, v4 ; GFX9-NEXT: s_xor_b32 s0, s5, s0 ; GFX9-NEXT: s_ashr_i32 s6, s0, 31 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s0, 0, s1 ; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 @@ -1089,7 +1089,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_xor_b32 s1, s1, s6 ; GFX9-NEXT: s_sub_i32 s10, 0, s5 ; GFX9-NEXT: s_sub_i32 s6, s1, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s0, s8 ; GFX9-NEXT: v_readfirstlane_b32 s8, v5 ; GFX9-NEXT: s_xor_b32 s4, s8, s4 @@ -1118,7 +1118,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_xor_b32 s5, s5, s4 ; GFX9-NEXT: s_sub_i32 s11, 0, s8 ; GFX9-NEXT: s_sub_i32 s4, s5, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s10, v6 ; GFX9-NEXT: s_xor_b32 s7, s10, s7 ; GFX9-NEXT: 
s_abs_i32 s10, s10 @@ -1147,7 +1147,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_readfirstlane_b32 s10, v7 ; GFX9-NEXT: s_xor_b32 s5, s5, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-NEXT: s_xor_b32 s4, s10, s9 ; GFX9-NEXT: s_sub_i32 s9, 0, s8 ; GFX9-NEXT: s_sub_i32 s5, s5, s7 @@ -1467,7 +1467,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v1 ; GCN-NEXT: v_xor_b32_e32 v0, v1, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -1500,7 +1500,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v1 ; TONGA-NEXT: v_xor_b32_e32 v0, v1, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; TONGA-NEXT: v_rcp_f32_e32 v4, v2 ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 ; TONGA-NEXT: v_mul_f32_e32 v1, v3, v4 ; TONGA-NEXT: v_trunc_f32_e32 v1, v1 @@ -1533,7 +1533,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX9-NEXT: v_rcp_f32_e32 v4, v2 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -1620,7 +1620,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_or_b32_e32 v0, v3, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; 
GCN-NEXT: v_or_b32_e32 v0, 1, v0 @@ -1663,7 +1663,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_or_b32_e32 v0, v3, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 ; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; TONGA-NEXT: v_rcp_f32_e32 v4, v2 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 @@ -1706,7 +1706,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX9-NEXT: v_rcp_f32_e32 v4, v2 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v3, v3, v4 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GCN-NEXT: v_rcp_f32_e32 v4, v1 ; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 @@ -1848,7 +1848,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_or_b32_e32 v3, v3, v4 ; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v3 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; TONGA-NEXT: v_rcp_f32_e32 v4, v1 ; TONGA-NEXT: v_xor_b32_e32 v0, v2, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 @@ -1889,7 +1889,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GFX9-NEXT: v_rcp_f32_e32 v4, v1 ; GFX9-NEXT: v_xor_b32_e32 v0, v2, v0 ; GFX9-NEXT: 
v_ashrrev_i32_e32 v0, 30, v0 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 @@ -1980,7 +1980,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GCN-NEXT: v_rcp_f32_e32 v3, v3 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 ; GCN-NEXT: v_max_i32_e32 v5, v0, v5 ; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 @@ -2027,7 +2027,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v2 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; TONGA-NEXT: v_rcp_f32_e32 v3, v3 ; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 ; TONGA-NEXT: v_max_i32_e32 v5, v0, v5 ; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 @@ -2074,7 +2074,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 ; GFX9-NEXT: s_mov_b32 s0, s8 ; GFX9-NEXT: s_sub_i32 s7, 0, s5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: s_bfe_i32 s6, s6, 0x190000 ; GFX9-NEXT: s_xor_b32 s4, s6, s4 ; GFX9-NEXT: s_abs_i32 s6, s6 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 50f6acf3f85a2..68466abf31aa0 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -492,7 +492,7 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i32 s1, s3, 8 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_xor_b32 s0, s1, s0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s2, s0, 1 @@ -523,7 +523,7 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: 
s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_ashr_i32 s1, s3, 8 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_xor_b32 s0, s1, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-IR-NEXT: s_or_b32 s2, s0, 1 @@ -554,7 +554,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) { ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v0 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_mul_lo_u32 v3, v3, v2 @@ -581,7 +581,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, v0 ; GCN-IR-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v2 @@ -619,7 +619,7 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_sub_i32 s2, 0, s9 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_abs_i32 s0, s3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_xor_b32 s1, s3, s8 ; GCN-NEXT: s_ashr_i32 s1, s1, 31 @@ -660,7 +660,7 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_abs_i32 s0, s3 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_xor_b32 s1, s3, s8 ; GCN-IR-NEXT: s_ashr_i32 s1, s1, 31 @@ -708,7 +708,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: 
s_sub_i32 s2, 0, s9 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_ashr_i32 s0, s3, 1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -752,7 +752,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s3, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -805,7 +805,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i32 s1, s3, 9 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_xor_b32 s0, s1, s0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s2, s0, 1 @@ -836,7 +836,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_ashr_i32 s1, s3, 9 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_xor_b32 s0, s1, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-IR-NEXT: s_or_b32 s2, s0, 1 @@ -872,7 +872,7 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s9 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_ashr_i32 s0, s3, 7 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -916,7 +916,7 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, 
s9 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s3, 7 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -967,7 +967,7 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-NEXT: s_ashr_i32 s5, s9, 8 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GCN-NEXT: s_xor_b32 s4, s5, s4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-NEXT: s_ashr_i32 s6, s11, 8 ; GCN-NEXT: s_ashr_i32 s7, s15, 8 @@ -984,7 +984,7 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-NEXT: v_cvt_f32_i32_e32 v3, s6 ; GCN-NEXT: s_xor_b32 s4, s6, s7 ; GCN-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-NEXT: s_or_b32 s6, s4, 1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1013,7 +1013,7 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-IR-NEXT: s_ashr_i32 s5, s9, 8 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GCN-IR-NEXT: s_xor_b32 s4, s5, s4 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-IR-NEXT: s_ashr_i32 s6, s11, 8 ; GCN-IR-NEXT: s_ashr_i32 s7, s15, 8 @@ -1030,7 +1030,7 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s6 ; GCN-IR-NEXT: s_xor_b32 s4, s6, s7 ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-IR-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-IR-NEXT: s_or_b32 s6, s4, 1 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1069,7 +1069,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-NEXT: s_sext_i32_i16 s3, s3 ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 -; 
GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_xor_b32 s0, s2, s0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s2, s0, 1 @@ -1102,7 +1102,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 ; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_xor_b32 s0, s2, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-IR-NEXT: s_or_b32 s2, s0, 1 @@ -1834,7 +1834,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-NEXT: s_mov_b32 s2, 0x41c00000 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_ashr_i32 s0, s3, 31 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_or_b32 s3, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 @@ -1861,7 +1861,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-IR-NEXT: s_mov_b32 s2, 0x41c00000 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_or_b32 s3, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 @@ -1949,7 +1949,7 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) { ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v3, -v2, v0, s4 @@ -1969,7 +1969,7 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: 
v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v3, -v2, v0, s4 @@ -1994,7 +1994,7 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v3, -v2, v0, s4 @@ -2014,7 +2014,7 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v3, -v2, v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/sdivrem24.ll b/llvm/test/CodeGen/AMDGPU/sdivrem24.ll index cee4dac6afc27..abfb56a58221f 100644 --- a/llvm/test/CodeGen/AMDGPU/sdivrem24.ll +++ b/llvm/test/CodeGen/AMDGPU/sdivrem24.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}sdiv24_i8: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -24,7 +24,7 @@ define amdgpu_kernel void @sdiv24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; FUNC-LABEL: {{^}}sdiv24_i16: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -43,7 +43,7 @@ define amdgpu_kernel void @sdiv24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; FUNC-LABEL: {{^}}sdiv24_i32: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -65,7 +65,6 @@ define amdgpu_kernel void @sdiv24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; FUNC-LABEL: {{^}}sdiv25_i32: ; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: 
INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -84,7 +83,6 @@ define amdgpu_kernel void @sdiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; FUNC-LABEL: {{^}}test_no_sdiv24_i32_1: ; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -103,7 +101,6 @@ define amdgpu_kernel void @test_no_sdiv24_i32_1(ptr addrspace(1) %out, ptr addrs ; FUNC-LABEL: {{^}}test_no_sdiv24_i32_2: ; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -123,7 +120,7 @@ define amdgpu_kernel void @test_no_sdiv24_i32_2(ptr addrspace(1) %out, ptr addrs ; FUNC-LABEL: {{^}}srem24_i8: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -142,7 +139,7 @@ define amdgpu_kernel void @srem24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; FUNC-LABEL: {{^}}srem24_i16: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -161,7 +158,7 @@ define amdgpu_kernel void @srem24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; FUNC-LABEL: {{^}}srem24_i32: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -183,7 +180,6 @@ define amdgpu_kernel void @srem24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; FUNC-LABEL: {{^}}no_srem25_i32: ; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -202,7 +198,6 @@ define amdgpu_kernel void @no_srem25_i32(ptr addrspace(1) %out, ptr addrspace(1) ; FUNC-LABEL: {{^}}no_sdiv25_i24_i25_i32: ; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -221,7 +216,6 @@ define amdgpu_kernel void @no_sdiv25_i24_i25_i32(ptr addrspace(1) %out, ptr addr ; FUNC-LABEL: {{^}}no_sdiv25_i25_i24_i32: ; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -240,7 +234,6 @@ define amdgpu_kernel void @no_sdiv25_i25_i24_i32(ptr 
addrspace(1) %out, ptr addr ; FUNC-LABEL: {{^}}no_srem25_i24_i25_i32: ; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -259,7 +252,6 @@ define amdgpu_kernel void @no_srem25_i24_i25_i32(ptr addrspace(1) %out, ptr addr ; FUNC-LABEL: {{^}}no_srem25_i25_i24_i32: ; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -278,7 +270,7 @@ define amdgpu_kernel void @no_srem25_i25_i24_i32(ptr addrspace(1) %out, ptr addr ; FUNC-LABEL: {{^}}srem25_i24_i11_i32: ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 24 ; EG: INT_TO_FLT @@ -298,7 +290,7 @@ define amdgpu_kernel void @srem25_i24_i11_i32(ptr addrspace(1) %out, ptr addrspa ; FUNC-LABEL: {{^}}srem25_i11_i24_i32: ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 24 ; EG: INT_TO_FLT @@ -318,7 +310,7 @@ define amdgpu_kernel void @srem25_i11_i24_i32(ptr addrspace(1) %out, ptr addrspa ; FUNC-LABEL: {{^}}srem25_i17_i12_i32: ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 17 ; EG: INT_TO_FLT diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index 90345993de473..c870d651eb1aa 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -124,7 +124,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_readfirstlane_b32 s3, v0 ; GCN-NEXT: s_sub_i32 s5, 0, s2 ; GCN-NEXT: s_ashr_i32 s4, s3, 31 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: s_abs_i32 s3, s3 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -165,7 +165,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TAHITI-NEXT: s_abs_i32 s2, s2 ; TAHITI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; TAHITI-NEXT: s_sub_i32 s3, 0, s2 -; TAHITI-NEXT: 
v_rcp_iflag_f32_e32 v1, v1 +; TAHITI-NEXT: v_rcp_f32_e32 v1, v1 ; TAHITI-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; TAHITI-NEXT: v_cvt_u32_f32_e32 v1, v1 ; TAHITI-NEXT: v_mul_lo_u32 v2, s3, v1 @@ -202,7 +202,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: s_abs_i32 s2, s2 ; TONGA-NEXT: v_cvt_f32_u32_e32 v1, s2 ; TONGA-NEXT: s_sub_i32 s3, 0, s2 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; TONGA-NEXT: v_rcp_f32_e32 v1, v1 ; TONGA-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1 ; TONGA-NEXT: v_mul_lo_u32 v2, s3, v1 @@ -470,7 +470,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_sub_i32 s6, 0, s2 ; GCN-NEXT: s_ashr_i32 s5, s4, 31 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: s_abs_i32 s4, s4 ; GCN-NEXT: v_readfirstlane_b32 s3, v3 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2 @@ -493,7 +493,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_xor_b32 s2, s2, s5 ; GCN-NEXT: s_sub_i32 s7, 0, s3 ; GCN-NEXT: s_sub_i32 s2, s2, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: s_ashr_i32 s6, s4, 31 ; GCN-NEXT: s_abs_i32 s4, s4 @@ -536,7 +536,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: v_cvt_f32_u32_e32 v2, s0 ; TAHITI-NEXT: s_sub_i32 s1, 0, s0 ; TAHITI-NEXT: v_readfirstlane_b32 s7, v3 -; TAHITI-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; TAHITI-NEXT: v_rcp_f32_e32 v2, v2 ; TAHITI-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; TAHITI-NEXT: v_cvt_u32_f32_e32 v2, v2 ; TAHITI-NEXT: v_mul_lo_u32 v4, s1, v2 @@ -560,7 +560,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: s_sub_i32 s0, 0, s7 ; TAHITI-NEXT: s_mov_b32 s1, s5 ; TAHITI-NEXT: s_xor_b32 s6, s6, s8 -; 
TAHITI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TAHITI-NEXT: v_rcp_f32_e32 v0, v0 ; TAHITI-NEXT: s_sub_i32 s6, s6, s8 ; TAHITI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; TAHITI-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -602,7 +602,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_sub_i32 s3, 0, s2 ; TONGA-NEXT: v_readfirstlane_b32 s5, v3 ; TONGA-NEXT: v_mov_b32_e32 v3, s1 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; TONGA-NEXT: v_rcp_f32_e32 v2, v2 ; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2 ; TONGA-NEXT: v_mul_lo_u32 v4, s3, v2 @@ -628,7 +628,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_sub_i32 s5, 0, s4 ; TONGA-NEXT: s_abs_i32 s1, s0 ; TONGA-NEXT: s_xor_b32 s2, s2, s3 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_rcp_f32_e32 v0, v0 ; TONGA-NEXT: s_sub_i32 s2, s2, s3 ; TONGA-NEXT: s_ashr_i32 s0, s0, 31 ; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -855,7 +855,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v5 ; GCN-NEXT: s_ashr_i32 s5, s4, 31 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: s_abs_i32 s4, s4 ; GCN-NEXT: v_readfirstlane_b32 s3, v2 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -878,7 +878,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_xor_b32 s2, s2, s5 ; GCN-NEXT: s_sub_i32 s8, 0, s3 ; GCN-NEXT: s_sub_i32 s2, s2, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: v_readfirstlane_b32 s6, v6 ; GCN-NEXT: s_ashr_i32 s7, s6, 31 ; GCN-NEXT: s_abs_i32 s6, s6 @@ -903,7 +903,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_xor_b32 s3, s3, s7 ; GCN-NEXT: s_sub_i32 s9, 0, s4 ; GCN-NEXT: s_sub_i32 s3, s3, s7 -; GCN-NEXT: 
v_rcp_iflag_f32_e32 v1, v1 +; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: v_readfirstlane_b32 s6, v7 ; GCN-NEXT: s_ashr_i32 s8, s6, 31 ; GCN-NEXT: s_abs_i32 s6, s6 @@ -928,7 +928,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_readfirstlane_b32 s6, v8 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_ashr_i32 s2, s6, 31 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-NEXT: v_rcp_f32_e32 v3, v2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: s_abs_i32 s3, s6 ; GCN-NEXT: s_sub_i32 s6, 0, s5 @@ -974,7 +974,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: v_cvt_f32_u32_e32 v0, s0 ; TAHITI-NEXT: s_sub_i32 s1, 0, s0 ; TAHITI-NEXT: v_readfirstlane_b32 s7, v1 -; TAHITI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TAHITI-NEXT: v_rcp_f32_e32 v0, v0 ; TAHITI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; TAHITI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; TAHITI-NEXT: v_mul_lo_u32 v8, s1, v0 @@ -999,7 +999,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: s_sub_i32 s7, 0, s6 ; TAHITI-NEXT: v_readfirstlane_b32 s8, v5 ; TAHITI-NEXT: s_abs_i32 s9, s8 -; TAHITI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TAHITI-NEXT: v_rcp_f32_e32 v0, v0 ; TAHITI-NEXT: s_xor_b32 s0, s0, s1 ; TAHITI-NEXT: s_sub_i32 s10, s0, s1 ; TAHITI-NEXT: s_ashr_i32 s8, s8, 31 @@ -1024,7 +1024,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: s_sub_i32 s6, 0, s1 ; TAHITI-NEXT: v_readfirstlane_b32 s7, v6 ; TAHITI-NEXT: s_abs_i32 s9, s7 -; TAHITI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TAHITI-NEXT: v_rcp_f32_e32 v0, v0 ; TAHITI-NEXT: s_xor_b32 s0, s0, s8 ; TAHITI-NEXT: s_sub_i32 s8, s0, s8 ; TAHITI-NEXT: s_ashr_i32 s7, s7, 31 @@ -1049,7 +1049,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: s_sub_i32 s1, 0, s6 ; TAHITI-NEXT: s_mov_b32 s0, s4 ; TAHITI-NEXT: v_readfirstlane_b32 s4, v7 -; TAHITI-NEXT: 
v_rcp_iflag_f32_e32 v0, v0 +; TAHITI-NEXT: v_rcp_f32_e32 v0, v0 ; TAHITI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; TAHITI-NEXT: v_cvt_u32_f32_e32 v2, v0 ; TAHITI-NEXT: v_mov_b32_e32 v0, s10 @@ -1097,7 +1097,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s2 ; TONGA-NEXT: s_sub_i32 s3, 0, s2 ; TONGA-NEXT: v_readfirstlane_b32 s5, v1 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_rcp_f32_e32 v0, v0 ; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0 ; TONGA-NEXT: v_mul_lo_u32 v8, s3, v0 @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_sub_i32 s5, 0, s4 ; TONGA-NEXT: v_readfirstlane_b32 s6, v5 ; TONGA-NEXT: s_abs_i32 s7, s6 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_rcp_f32_e32 v0, v0 ; TONGA-NEXT: s_xor_b32 s2, s2, s3 ; TONGA-NEXT: s_sub_i32 s2, s2, s3 ; TONGA-NEXT: s_ashr_i32 s6, s6, 31 @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_sub_i32 s5, 0, s4 ; TONGA-NEXT: v_readfirstlane_b32 s7, v6 ; TONGA-NEXT: s_abs_i32 s8, s7 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_rcp_f32_e32 v0, v0 ; TONGA-NEXT: s_xor_b32 s3, s3, s6 ; TONGA-NEXT: s_sub_i32 s3, s3, s6 ; TONGA-NEXT: s_ashr_i32 s7, s7, 31 @@ -1172,7 +1172,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_abs_i32 s5, s5 ; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s5 ; TONGA-NEXT: s_sub_i32 s0, 0, s5 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_rcp_f32_e32 v0, v0 ; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v0 ; TONGA-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 8880bc9bb2057..82196b73b66e4 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -479,7 +479,7 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_xor_b32 s0, s3, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_or_b32 s8, s0, 1 @@ -515,7 +515,7 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_xor_b32 s0, s3, s2 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_or_b32 s8, s0, 1 @@ -558,7 +558,7 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_xor_b32 s0, s3, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_or_b32 s8, s0, 1 @@ -594,7 +594,7 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_xor_b32 s0, s3, s2 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_or_b32 s8, s0, 1 @@ -631,7 +631,7 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { ; GCN-NEXT: v_ashrrev_i32_e32 v1, 8, v1 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v1 ; GCN-NEXT: v_xor_b32_e32 v5, v1, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 30, v5 ; GCN-NEXT: v_or_b32_e32 v5, 1, v5 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -655,7 +655,7 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { ; 
GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 8, v1 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v1 ; GCN-IR-NEXT: v_xor_b32_e32 v5, v1, v0 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-IR-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 30, v5 ; GCN-IR-NEXT: v_or_b32_e32 v5, 1, v5 ; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -689,7 +689,7 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -730,7 +730,7 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -778,7 +778,7 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -819,7 +819,7 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -865,7 +865,7 @@ define amdgpu_kernel void @s_test_srem32_64(ptr 
addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_abs_i32 s8, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_sub_i32 s0, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -904,7 +904,7 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_abs_i32 s8, s0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -1216,7 +1216,7 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-NEXT: s_sext_i32_i16 s3, s3 ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_xor_b32 s3, s2, s4 ; GCN-NEXT: s_ashr_i32 s3, s3, 30 ; GCN-NEXT: s_or_b32 s3, s3, 1 @@ -1251,7 +1251,7 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 ; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_xor_b32 s3, s2, s4 ; GCN-IR-NEXT: s_ashr_i32 s3, s3, 30 ; GCN-IR-NEXT: s_or_b32 s3, s3, 1 @@ -1991,7 +1991,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-NEXT: s_or_b32 s3, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -2023,7 +2023,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr 
addrspace(1) %out, i64 %x ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-IR-NEXT: s_or_b32 s3, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 @@ -2125,7 +2125,7 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) { ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-NEXT: v_rcp_f32_e32 v3, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x41c00000, v3 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-NEXT: v_mad_f32 v4, -v3, v2, s4 @@ -2147,7 +2147,7 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-IR-NEXT: v_rcp_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v3, 0x41c00000, v3 ; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-IR-NEXT: v_mad_f32 v4, -v3, v2, s4 @@ -2174,7 +2174,7 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-NEXT: v_rcp_f32_e32 v3, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x47000000, v3 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-NEXT: v_mad_f32 v4, -v3, v2, s4 @@ -2196,7 +2196,7 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-IR-NEXT: v_rcp_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v3, 0x47000000, v3 ; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-IR-NEXT: v_mad_f32 v4, -v3, v2, s4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index dd2acb8de6f41..c0918dd78be5f 100644 
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -22,7 +22,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_u32_e32 v2, v1 ; SI-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; SI-NEXT: v_rcp_f32_e32 v2, v2 ; SI-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 ; SI-NEXT: v_mul_lo_u32 v3, v3, v2 @@ -58,7 +58,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_u32_e32 v2, v1 ; VI-NEXT: v_sub_u32_e32 v3, vcc, 0, v1 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; VI-NEXT: v_rcp_f32_e32 v2, v2 ; VI-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 ; VI-NEXT: v_mul_lo_u32 v3, v3, v2 @@ -91,7 +91,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_mul_lo_u32 v3, v3, v2 @@ -124,7 +124,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX1030-NEXT: v_readfirstlane_b32 s5, v0 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX1030-NEXT: s_sub_i32 s4, 0, s2 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1030-NEXT: v_rcp_f32_e32 v1, v1 ; GFX1030-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1030-NEXT: v_readfirstlane_b32 s3, v1 @@ -194,7 +194,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-NEXT: v_cvt_f32_u32_e32 v0, s5 ; SI-NEXT: s_sub_i32 s2, 0, s5 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_rcp_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 
0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_mul_lo_u32 v1, s2, v0 @@ -226,7 +226,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; VI-NEXT: v_cvt_f32_u32_e32 v0, s5 ; VI-NEXT: s_sub_i32 s2, 0, s5 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; VI-NEXT: v_mul_lo_u32 v1, s2, v0 @@ -259,7 +259,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GCN-NEXT: s_sub_i32 s4, 0, s3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s4, v0 @@ -290,7 +290,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX1030-NEXT: s_sub_i32 s5, 0, s3 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX1030-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v0 @@ -365,8 +365,8 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_u32_e32 v4, v2 ; SI-NEXT: v_cvt_f32_u32_e32 v5, v3 ; SI-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 -; SI-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; SI-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; SI-NEXT: v_rcp_f32_e32 v4, v4 +; SI-NEXT: v_rcp_f32_e32 v5, v5 ; SI-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 ; SI-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; SI-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 @@ -420,8 +420,8 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_u32_e32 v4, v2 ; VI-NEXT: v_cvt_f32_u32_e32 v5, v3 ; VI-NEXT: v_sub_u32_e32 v6, vcc, 0, v2 -; 
VI-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; VI-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; VI-NEXT: v_rcp_f32_e32 v4, v4 +; VI-NEXT: v_rcp_f32_e32 v5, v5 ; VI-NEXT: v_sub_u32_e32 v7, vcc, 0, v3 ; VI-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; VI-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 @@ -471,8 +471,8 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GCN-NEXT: v_rcp_f32_e32 v4, v4 +; GCN-NEXT: v_rcp_f32_e32 v5, v5 ; GCN-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GCN-NEXT: v_cvt_u32_f32_e32 v6, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 @@ -525,8 +525,8 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v3, s3 ; GFX1030-NEXT: s_sub_i32 s5, 0, s2 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX1030-NEXT: v_rcp_f32_e32 v2, v2 +; GFX1030-NEXT: v_rcp_f32_e32 v3, v3 ; GFX1030-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v3 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -643,10 +643,10 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_u32_e32 v10, v1 ; SI-NEXT: v_cvt_f32_u32_e32 v12, v2 ; SI-NEXT: v_cvt_f32_u32_e32 v14, v3 -; SI-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; SI-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; SI-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; SI-NEXT: v_rcp_iflag_f32_e32 v14, v14 +; SI-NEXT: v_rcp_f32_e32 v8, v8 +; SI-NEXT: v_rcp_f32_e32 v10, v10 +; SI-NEXT: v_rcp_f32_e32 v12, v12 +; SI-NEXT: v_rcp_f32_e32 v14, v14 ; SI-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; SI-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 ; SI-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 @@ -738,10 +738,10 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) 
%out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_u32_e32 v10, v1 ; VI-NEXT: v_cvt_f32_u32_e32 v12, v2 ; VI-NEXT: v_cvt_f32_u32_e32 v14, v3 -; VI-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; VI-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; VI-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; VI-NEXT: v_rcp_iflag_f32_e32 v14, v14 +; VI-NEXT: v_rcp_f32_e32 v8, v8 +; VI-NEXT: v_rcp_f32_e32 v10, v10 +; VI-NEXT: v_rcp_f32_e32 v12, v12 +; VI-NEXT: v_rcp_f32_e32 v14, v14 ; VI-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; VI-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 ; VI-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 @@ -836,10 +836,10 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v12, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v14, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v16, v3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; GCN-NEXT: v_rcp_iflag_f32_e32 v14, v14 -; GCN-NEXT: v_rcp_iflag_f32_e32 v16, v16 +; GCN-NEXT: v_rcp_f32_e32 v10, v10 +; GCN-NEXT: v_rcp_f32_e32 v12, v12 +; GCN-NEXT: v_rcp_f32_e32 v14, v14 +; GCN-NEXT: v_rcp_f32_e32 v16, v16 ; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 ; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 ; GCN-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 @@ -929,8 +929,8 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX1030-NEXT: s_sub_i32 s6, 0, s2 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1030-NEXT: v_rcp_f32_e32 v0, v0 +; GFX1030-NEXT: v_rcp_f32_e32 v1, v1 ; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1030-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -939,7 +939,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX1030-NEXT: v_readfirstlane_b32 s9, v1 ; GFX1030-NEXT: s_mul_i32 s6, s6, s4 -; 
GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX1030-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030-NEXT: s_mul_hi_u32 s6, s4, s6 ; GFX1030-NEXT: s_add_i32 s4, s4, s6 ; GFX1030-NEXT: s_mul_hi_u32 s4, s7, s4 @@ -965,7 +965,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1030-NEXT: s_mul_hi_u32 s6, s7, s9 ; GFX1030-NEXT: v_readfirstlane_b32 s10, v0 ; GFX1030-NEXT: s_mul_i32 s8, s6, s3 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1030-NEXT: v_rcp_f32_e32 v1, v1 ; GFX1030-NEXT: s_sub_i32 s7, s7, s8 ; GFX1030-NEXT: s_add_i32 s8, s6, 1 ; GFX1030-NEXT: s_sub_i32 s9, s7, s3 @@ -1404,7 +1404,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; SI-NEXT: v_rcp_f32_e32 v2, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; SI-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -1433,7 +1433,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -1460,7 +1460,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cvt_f32_ubyte1_e32 v3, v2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v3 +; GCN-NEXT: v_rcp_f32_e32 v4, v3 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; GCN-NEXT: v_mul_f32_e32 v4, v2, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 @@ -1481,7 +1481,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: v_cvt_f32_ubyte1_e32 v2, v1 ; GFX1030-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 -; GFX1030-NEXT: 
v_rcp_iflag_f32_e32 v3, v2 +; GFX1030-NEXT: v_rcp_f32_e32 v3, v2 ; GFX1030-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX1030-NEXT: v_trunc_f32_e32 v3, v3 ; GFX1030-NEXT: v_fma_f32 v1, -v3, v2, v1 @@ -1549,7 +1549,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; SI-NEXT: v_rcp_f32_e32 v2, v0 ; SI-NEXT: v_mul_f32_e32 v2, v1, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1578,7 +1578,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1604,7 +1604,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GCN-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4 @@ -1624,7 +1624,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX1030-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX1030-NEXT: v_rcp_f32_e32 v3, v2 ; GFX1030-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX1030-NEXT: v_trunc_f32_e32 v3, v3 ; GFX1030-NEXT: v_fma_f32 v1, -v3, v2, v1 @@ -1700,7 
+1700,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; SI-NEXT: v_rcp_f32_e32 v2, v0 ; SI-NEXT: v_mul_f32_e32 v2, v1, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1737,7 +1737,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v1, v3, v1 ; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1783,7 +1783,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -1815,7 +1815,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX1030-NEXT: v_rcp_f32_e32 v3, v1 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX1030-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX1030-NEXT: v_trunc_f32_e32 v3, v3 @@ -1900,7 +1900,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; SI-NEXT: v_rcp_f32_e32 v2, v0 ; SI-NEXT: v_mul_f32_e32 v2, v1, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1937,7 +1937,7 @@ define amdgpu_kernel void 
@v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v1, v3, v1 ; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1983,7 +1983,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -2015,7 +2015,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX1030-NEXT: v_rcp_f32_e32 v3, v1 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX1030-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX1030-NEXT: v_trunc_f32_e32 v3, v3 @@ -2373,7 +2373,7 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_i32_e32 v3, v1 ; SI-NEXT: v_xor_b32_e32 v0, v1, v0 -; SI-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; SI-NEXT: v_rcp_f32_e32 v4, v2 ; SI-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; SI-NEXT: v_or_b32_e32 v0, 1, v0 ; SI-NEXT: v_mul_f32_e32 v1, v3, v4 @@ -2402,7 +2402,7 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_i32_e32 v3, v1 ; VI-NEXT: v_xor_b32_e32 v0, v1, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; VI-NEXT: v_rcp_f32_e32 v4, v2 ; VI-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; VI-NEXT: v_or_b32_e32 v0, 1, v0 ; VI-NEXT: v_mul_f32_e32 v1, v3, v4 @@ -2432,7 +2432,7 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) 
nocapture readon ; GCN-NEXT: v_cvt_f32_i32_e32 v4, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cvt_f32_i32_e32 v5, v3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GCN-NEXT: v_rcp_f32_e32 v6, v4 ; GCN-NEXT: v_xor_b32_e32 v2, v3, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v2 ; GCN-NEXT: v_or_b32_e32 v2, 1, v2 @@ -2457,7 +2457,7 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; GFX1030-NEXT: global_load_sbyte v3, v[0:1], off ; GFX1030-NEXT: s_waitcnt vmcnt(1) ; GFX1030-NEXT: v_cvt_f32_i32_e32 v4, v2 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v5, v4 +; GFX1030-NEXT: v_rcp_f32_e32 v5, v4 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: v_cvt_f32_i32_e32 v6, v3 ; GFX1030-NEXT: v_xor_b32_e32 v2, v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index c2bd180fedcca..edd84a5f09e5e 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -405,7 +405,7 @@ define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_lshr_b32 s2, s3, 8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -430,7 +430,7 @@ define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -458,7 +458,7 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, v1, 
v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -476,7 +476,7 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -501,7 +501,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_sub_i32 s0, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -537,7 +537,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -581,7 +581,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: s_lshr_b32 s8, s0, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_sub_i32 s0, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -619,7 +619,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: s_lshr_b32 s8, s0, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 
0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -666,7 +666,7 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: s_lshr_b32 s2, s3, 9 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -691,7 +691,7 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 9 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -726,7 +726,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-NEXT: s_and_b32 s2, s2, 0xff000000 ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -754,7 +754,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_and_b32 s2, s2, 0xff000000 ; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -1446,7 +1446,7 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: 
v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 @@ -1467,7 +1467,7 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 @@ -1541,7 +1541,7 @@ define i64 @v_test_udiv24_k_num_i64(i64 %x) { ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 @@ -1558,7 +1558,7 @@ define i64 @v_test_udiv24_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 @@ -1580,7 +1580,7 @@ define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x47000000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 @@ -1597,7 +1597,7 @@ define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x47000000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; 
GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index eaab3531824c4..c6aec64d1692e 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -44,7 +44,7 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: s_sub_i32 s0, 0, s8 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -82,7 +82,7 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX8-NEXT: s_sub_i32 s0, 0, s6 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_rcp_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -171,8 +171,8 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX6-NEXT: s_sub_i32 s6, 0, s2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 @@ -218,8 +218,8 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX8-NEXT: s_sub_i32 s6, 0, s2 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s3 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX8-NEXT: v_rcp_f32_e32 v0, v0 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: 
v_mul_lo_u32 v1, s6, v0 @@ -340,8 +340,8 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX6-NEXT: s_sub_i32 s0, 0, s12 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -362,7 +362,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX6-NEXT: s_cselect_b32 s6, s1, s0 ; GFX6-NEXT: s_sub_i32 s0, 0, s13 ; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 @@ -380,7 +380,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX6-NEXT: s_cselect_b32 s7, s1, s0 ; GFX6-NEXT: s_sub_i32 s0, 0, s14 ; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 @@ -425,8 +425,8 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX8-NEXT: s_sub_i32 s0, 0, s12 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s13 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX8-NEXT: v_rcp_f32_e32 v0, v0 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -447,7 +447,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX8-NEXT: s_cselect_b32 s2, s1, s0 ; GFX8-NEXT: s_sub_i32 s0, 0, s13 ; GFX8-NEXT: v_mul_lo_u32 
v0, s0, v1 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 @@ -465,7 +465,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX8-NEXT: s_cselect_b32 s3, s1, s0 ; GFX8-NEXT: s_sub_i32 s0, 0, s14 ; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s10, v0 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem24.ll b/llvm/test/CodeGen/AMDGPU/udivrem24.ll index 935a9bf23c9cb..4d9ece813c402 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem24.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem24.ll @@ -22,7 +22,7 @@ define amdgpu_kernel void @udiv24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -48,7 +48,7 @@ define amdgpu_kernel void @udiv24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -125,7 +125,7 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_in_out(ptr addrspace(1) %out, ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -151,7 +151,7 @@ 
define amdgpu_kernel void @udiv24_i8_denorm_flush_in_out(ptr addrspace(1) %out, ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -228,7 +228,7 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_in(ptr addrspace(1) %out, ptr ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -254,7 +254,7 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_in(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -331,7 +331,7 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_out(ptr addrspace(1) %out, ptr ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -357,7 +357,7 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_out(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -434,7 +434,7 @@ define amdgpu_kernel void @udiv24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: 
v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -462,7 +462,7 @@ define amdgpu_kernel void @udiv24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -531,7 +531,7 @@ define amdgpu_kernel void @udiv23_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_and_b32 s5, s5, 0x7fffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -553,7 +553,7 @@ define amdgpu_kernel void @udiv23_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_and_b32 s2, s2, 0x7fffff ; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 @@ -622,7 +622,7 @@ define amdgpu_kernel void @udiv24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_and_b32 s5, s5, 0xffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -644,7 +644,7 @@ define amdgpu_kernel void @udiv24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_and_b32 s2, s2, 0xffffff ; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; VI-NEXT: 
s_mov_b32 s3, 0xf000 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 @@ -716,7 +716,7 @@ define amdgpu_kernel void @no_udiv24_u23_u24_i32(ptr addrspace(1) %out, ptr addr ; SI-NEXT: s_and_b32 s5, s5, 0xffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -738,7 +738,7 @@ define amdgpu_kernel void @no_udiv24_u23_u24_i32(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_and_b32 s2, s2, 0x7fffff ; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 @@ -810,7 +810,7 @@ define amdgpu_kernel void @no_udiv24_u24_u23_i32(ptr addrspace(1) %out, ptr addr ; SI-NEXT: s_and_b32 s5, s5, 0x7fffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -832,7 +832,7 @@ define amdgpu_kernel void @no_udiv24_u24_u23_i32(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_and_b32 s2, s2, 0xffffff ; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 @@ -891,7 +891,6 @@ define amdgpu_kernel void @no_udiv24_u24_u23_i32(ptr addrspace(1) %out, ptr addr ret void } -; RCP_IFLAG is for URECIP in the full 32b alg define amdgpu_kernel void @udiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv25_i32: ; SI: 
; %bb.0: @@ -905,7 +904,7 @@ define amdgpu_kernel void @udiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_and_b32 s5, s5, 0x1ffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s5 ; SI-NEXT: s_sub_i32 s6, 0, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_rcp_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_mul_lo_u32 v1, s6, v0 @@ -939,7 +938,7 @@ define amdgpu_kernel void @udiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_sub_i32 s3, 0, s4 ; VI-NEXT: s_and_b32 s5, s2, 0x1ffffff ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; VI-NEXT: v_mul_lo_u32 v1, s3, v0 @@ -1010,7 +1009,6 @@ define amdgpu_kernel void @udiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -; RCP_IFLAG is for URECIP in the full 32b alg define amdgpu_kernel void @test_no_udiv24_i32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_no_udiv24_i32_1: ; SI: ; %bb.0: @@ -1024,7 +1022,7 @@ define amdgpu_kernel void @test_no_udiv24_i32_1(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_and_b32 s5, s5, 0x1ffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s5 ; SI-NEXT: s_sub_i32 s6, 0, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_rcp_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_mul_lo_u32 v1, s6, v0 @@ -1058,7 +1056,7 @@ define amdgpu_kernel void @test_no_udiv24_i32_1(ptr addrspace(1) %out, ptr addrs ; VI-NEXT: s_sub_i32 s3, 0, s4 ; VI-NEXT: s_and_b32 s5, s2, 0xffffff ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; VI-NEXT: v_mul_lo_u32 v1, s3, v0 @@ -1129,7 +1127,6 @@ define amdgpu_kernel void @test_no_udiv24_i32_1(ptr addrspace(1) %out, ptr addrs ret void } -; 
RCP_IFLAG is for URECIP in the full 32b alg define amdgpu_kernel void @test_no_udiv24_i32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_no_udiv24_i32_2: ; SI: ; %bb.0: @@ -1143,7 +1140,7 @@ define amdgpu_kernel void @test_no_udiv24_i32_2(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_and_b32 s5, s5, 0xffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s5 ; SI-NEXT: s_sub_i32 s6, 0, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_rcp_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_mul_lo_u32 v1, s6, v0 @@ -1177,7 +1174,7 @@ define amdgpu_kernel void @test_no_udiv24_i32_2(ptr addrspace(1) %out, ptr addrs ; VI-NEXT: s_sub_i32 s3, 0, s4 ; VI-NEXT: s_and_b32 s5, s2, 0x1ffffff ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; VI-NEXT: v_mul_lo_u32 v1, s3, v0 @@ -1267,7 +1264,7 @@ define amdgpu_kernel void @urem24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v4, v3 +; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: v_mul_f32_e32 v4, v2, v4 ; SI-NEXT: v_trunc_f32_e32 v4, v4 ; SI-NEXT: v_fma_f32 v2, -v4, v3, v2 @@ -1295,7 +1292,7 @@ define amdgpu_kernel void @urem24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; VI-NEXT: v_rcp_f32_e32 v3, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v1 ; VI-NEXT: v_mul_f32_e32 v3, v4, v3 @@ -1376,7 +1373,7 @@ define amdgpu_kernel void @urem24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_u32_e32 v2, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_u32_e32 v3, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v4, v3 +; SI-NEXT: 
v_rcp_f32_e32 v4, v3 ; SI-NEXT: v_mul_f32_e32 v4, v2, v4 ; SI-NEXT: v_trunc_f32_e32 v4, v4 ; SI-NEXT: v_fma_f32 v2, -v4, v3, v2 @@ -1406,7 +1403,7 @@ define amdgpu_kernel void @urem24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_u32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_u32_e32 v3, v1 -; VI-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; VI-NEXT: v_rcp_f32_e32 v4, v2 ; VI-NEXT: v_mul_f32_e32 v4, v3, v4 ; VI-NEXT: v_trunc_f32_e32 v4, v4 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v4 @@ -1479,7 +1476,7 @@ define amdgpu_kernel void @urem24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_and_b32 s7, s5, 0xffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s6 ; SI-NEXT: v_cvt_f32_u32_e32 v1, s7 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -1504,7 +1501,7 @@ define amdgpu_kernel void @urem24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_and_b32 s2, s4, 0xffffff ; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1562,7 +1559,6 @@ define amdgpu_kernel void @urem24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -; RCP_IFLAG is for URECIP in the full 32b alg define amdgpu_kernel void @urem25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: urem25_i32: ; SI: ; %bb.0: @@ -1575,7 +1571,7 @@ define amdgpu_kernel void @urem25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_and_b32 s4, s5, 0x1ffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: s_sub_i32 s5, 0, s4 -; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_rcp_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_mul_lo_u32 v1, s5, v0 @@ -1607,7 +1603,7 @@ define 
amdgpu_kernel void @urem25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_sub_i32 s3, 0, s4 ; VI-NEXT: s_and_b32 s5, s2, 0x1ffffff ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; VI-NEXT: v_mul_lo_u32 v1, s3, v0 @@ -1673,7 +1669,6 @@ define amdgpu_kernel void @urem25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -; RCP_IFLAG is for URECIP in the full 32b alg define amdgpu_kernel void @test_no_urem24_i32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_no_urem24_i32_1: ; SI: ; %bb.0: @@ -1686,7 +1681,7 @@ define amdgpu_kernel void @test_no_urem24_i32_1(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_and_b32 s4, s5, 0x1ffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: s_sub_i32 s5, 0, s4 -; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_rcp_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_mul_lo_u32 v1, s5, v0 @@ -1718,7 +1713,7 @@ define amdgpu_kernel void @test_no_urem24_i32_1(ptr addrspace(1) %out, ptr addrs ; VI-NEXT: s_sub_i32 s3, 0, s4 ; VI-NEXT: s_and_b32 s5, s2, 0xffffff ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; VI-NEXT: v_mul_lo_u32 v1, s3, v0 @@ -1784,7 +1779,6 @@ define amdgpu_kernel void @test_no_urem24_i32_1(ptr addrspace(1) %out, ptr addrs ret void } -; RCP_IFLAG is for URECIP in the full 32b alg define amdgpu_kernel void @test_no_urem24_i32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_no_urem24_i32_2: ; SI: ; %bb.0: @@ -1797,7 +1791,7 @@ define amdgpu_kernel void @test_no_urem24_i32_2(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_and_b32 s4, s5, 0xffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: s_sub_i32 s5, 0, s4 -; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 
+; SI-NEXT: v_rcp_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_mul_lo_u32 v1, s5, v0 @@ -1829,7 +1823,7 @@ define amdgpu_kernel void @test_no_urem24_i32_2(ptr addrspace(1) %out, ptr addrs ; VI-NEXT: s_sub_i32 s3, 0, s4 ; VI-NEXT: s_and_b32 s5, s2, 0x1ffffff ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; VI-NEXT: v_mul_lo_u32 v1, s3, v0 @@ -1908,7 +1902,7 @@ define amdgpu_kernel void @test_udiv24_u16_u23_i32(ptr addrspace(1) %out, ptr ad ; SI-NEXT: s_and_b32 s5, s5, 0x7fffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -1930,7 +1924,7 @@ define amdgpu_kernel void @test_udiv24_u16_u23_i32(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 @@ -1999,7 +1993,7 @@ define amdgpu_kernel void @test_udiv24_u23_u16_i32(ptr addrspace(1) %out, ptr ad ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -2021,7 +2015,7 @@ define amdgpu_kernel void @test_udiv24_u23_u16_i32(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_and_b32 s2, s2, 0x7fffff ; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_mov_b32 
s2, -1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index bed95d73e9961..3bb489c654535 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -445,7 +445,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: s_lshr_b32 s8, s0, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_sub_i32 s0, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -480,7 +480,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: s_lshr_b32 s8, s0, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -523,9 +523,9 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-NEXT: s_sub_i32 s1, 0, s0 ; GCN-NEXT: s_lshr_b32 s6, s15, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_lshr_b32 s7, s11, 1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s1, v0 @@ -578,9 +578,9 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-IR-NEXT: s_sub_i32 s1, 0, s0 ; GCN-IR-NEXT: s_lshr_b32 s6, s15, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: s_lshr_b32 s7, s11, 1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-IR-NEXT: 
v_rcp_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s1, v0 @@ -641,7 +641,7 @@ define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: s_lshr_b32 s5, s3, 8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -666,7 +666,7 @@ define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: s_lshr_b32 s5, s3, 8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -699,9 +699,9 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 ; GCN-NEXT: s_lshr_b32 s4, s15, 9 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GCN-NEXT: s_lshr_b32 s5, s11, 9 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GCN-NEXT: v_rcp_f32_e32 v3, v1 ; GCN-NEXT: s_sub_i32 s8, 0, s6 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -746,9 +746,9 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 ; GCN-IR-NEXT: s_lshr_b32 s4, s15, 9 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GCN-IR-NEXT: s_lshr_b32 s5, s11, 9 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GCN-IR-NEXT: v_rcp_f32_e32 v3, v1 ; GCN-IR-NEXT: s_sub_i32 s8, 0, s6 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1391,7 
+1391,7 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-NEXT: s_lshr_b32 s4, s3, 8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 @@ -1414,7 +1414,7 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-IR-NEXT: s_lshr_b32 s4, s3, 8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 @@ -1496,7 +1496,7 @@ define i64 @v_test_urem24_k_num_i64(i64 %x) { ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_rcp_f32_e32 v2, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1515,7 +1515,7 @@ define i64 @v_test_urem24_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1539,7 +1539,7 @@ define i64 @v_test_urem24_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_rcp_f32_e32 v2, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1558,7 +1558,7 @@ define i64 
@v_test_urem24_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index f207aac9421a4..445ffdd4c2629 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -2458,7 +2458,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 -; GFX1032-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1032-NEXT: v_rcp_f32_e32 v1, v1 ; GFX1032-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1032-NEXT: v_mul_lo_u32 v2, s1, v1 @@ -2492,7 +2492,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 -; GFX1064-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1064-NEXT: v_rcp_f32_e32 v1, v1 ; GFX1064-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1064-NEXT: v_mul_lo_u32 v2, s1, v1 @@ -2765,7 +2765,7 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 -; GFX1032-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1032-NEXT: v_rcp_f32_e32 v1, v1 ; GFX1032-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1032-NEXT: v_mul_lo_u32 v2, s1, v1 @@ -2799,7 +2799,7 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 -; GFX1064-NEXT: 
v_rcp_iflag_f32_e32 v1, v1 +; GFX1064-NEXT: v_rcp_f32_e32 v1, v1 ; GFX1064-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1064-NEXT: v_mul_lo_u32 v2, s1, v1 diff --git a/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll index 827570e7311c7..6a3acf36bb1f2 100644 --- a/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll +++ b/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll @@ -217,3 +217,25 @@ define <8 x i64> @test_unsigned_v8i64_v8f32(<8 x float> %f) nounwind { %x = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> %f) ret <8 x i64> %x } + +; VCVTTBF162IUBS +define <32 x i8> @test_unsigned_v32i8_v32bf16(<32 x bfloat> %f) nounwind { +; CHECK-LABEL: test_unsigned_v32i8_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttbf162iubs %zmm0, %zmm0 +; CHECK-NEXT: vpmovwb %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <32 x i8> @llvm.fptoui.sat.v32i8.v32bf16(<32 x bfloat> %f) + ret <32 x i8> %x +} + +; VCVTTBF162IBS +define <32 x i8> @test_signed_v32i8_v32bf16(<32 x bfloat> %f) nounwind { +; CHECK-LABEL: test_signed_v32i8_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttbf162ibs %zmm0, %zmm0 +; CHECK-NEXT: vpmovwb %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <32 x i8> @llvm.fptosi.sat.v32i8.v32bf16(<32 x bfloat> %f) + ret <32 x i8> %x +} diff --git a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll index 3d79457eb2a8a..1950de32cc975 100644 --- a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll +++ b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll @@ -674,3 +674,47 @@ define <4 x i64> @test_unsigned_v4i64_v4f32(<4 x float> %f) nounwind { %x = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> %f) ret <4 x i64> %x } + +; VCVTTBF162IUBS +define <8 x i8> @test_unsigned_v8i8_v8bf16(<8 x bfloat> %f) nounwind { +; CHECK-LABEL: test_unsigned_v8i8_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttbf162iubs %xmm0, %xmm0 +; 
CHECK-NEXT: vpmovwb %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i8> @llvm.fptoui.sat.v8i8.v8bf16(<8 x bfloat> %f) + ret <8 x i8> %x +} + +define <16 x i8> @test_unsigned_v16i8_v16bf16(<16 x bfloat> %f) nounwind { +; CHECK-LABEL: test_unsigned_v16i8_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttbf162iubs %ymm0, %ymm0 +; CHECK-NEXT: vpmovwb %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %x = call <16 x i8> @llvm.fptoui.sat.v16i8.v16bf16(<16 x bfloat> %f) + ret <16 x i8> %x +} + +; VCVTTBF162IBS +define <8 x i8> @test_signed_v8i8_v8bf16(<8 x bfloat> %f) nounwind { +; CHECK-LABEL: test_signed_v8i8_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttbf162ibs %xmm0, %xmm0 +; CHECK-NEXT: vpmovwb %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i8> @llvm.fptosi.sat.v8i8.v8bf16(<8 x bfloat> %f) + ret <8 x i8> %x +} + +define <16 x i8> @test_signed_v16i8_v16bf16(<16 x bfloat> %f) nounwind { +; CHECK-LABEL: test_signed_v16i8_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttbf162ibs %ymm0, %ymm0 +; CHECK-NEXT: vpmovwb %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %x = call <16 x i8> @llvm.fptosi.sat.v16i8.v16bf16(<16 x bfloat> %f) + ret <16 x i8> %x +} diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll index ebb5bc9069890..8dc76308edfc0 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -13,13 +13,13 @@ ; 128-bit Vectors ; -define i64 @test_reduce_v2i64(<2 x i64> %a0) { +define i64 @test_reduce_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v2i64: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa 
%xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -67,19 +67,18 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v2i64: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE42-NEXT: movq %xmm0, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovgq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1OR2-LABEL: test_reduce_v2i64: ; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1OR2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX1OR2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1OR2-NEXT: vmovq %xmm0, %rax +; X64-AVX1OR2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1OR2-NEXT: vmovq %xmm0, %rcx +; X64-AVX1OR2-NEXT: cmpq %rax, %rcx +; X64-AVX1OR2-NEXT: cmovgq %rcx, %rax ; X64-AVX1OR2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v2i64: @@ -95,7 +94,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ret i64 %4 } -define i32 @test_reduce_v4i32(<4 x i32> %a0) { +define i32 @test_reduce_v4i32(<4 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -171,7 +170,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ret i32 %7 } -define i16 @test_reduce_v8i16(<8 x i16> %a0) { +define i16 @test_reduce_v8i16(<8 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -255,7 +254,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v16i8(<16 x i8> %a0) { +define i8 @test_reduce_v16i8(<16 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -270,21 +269,18 @@ define i8 
@test_reduce_v16i8(<16 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -332,14 +328,11 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -396,7 +389,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; 256-bit Vectors ; -define i64 @test_reduce_v4i64(<4 x 
i64> %a0) { +define i64 @test_reduce_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] @@ -415,9 +408,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -503,11 +496,10 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovgq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v4i64: @@ -515,10 +507,10 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovgq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -527,10 +519,10 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX2-NEXT: vextracti128 $1, 
%ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovgq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -553,7 +545,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ret i64 %7 } -define i32 @test_reduce_v8i32(<8 x i32> %a0) { +define i32 @test_reduce_v8i32(<8 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 @@ -686,7 +678,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ret i32 %10 } -define i16 @test_reduce_v16i16(<16 x i16> %a0) { +define i16 @test_reduce_v16i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -810,7 +802,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ret i16 %13 } -define i8 @test_reduce_v32i8(<32 x i8> %a0) { +define i8 @test_reduce_v32i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 @@ -830,21 +822,18 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pandn %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn 
%xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -915,14 +904,11 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm1, %xmm2 ; X64-SSE2-NEXT: pandn %xmm0, %xmm1 ; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X64-SSE2-NEXT: pand %xmm2, %xmm1 -; X64-SSE2-NEXT: pandn %xmm0, %xmm2 -; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -1003,13 +989,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; 512-bit Vectors ; -define i64 @test_reduce_v8i64(<8 x i64> %a0) { +define i64 @test_reduce_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 ; X86-SSE2-NEXT: pxor %xmm4, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 ; X86-SSE2-NEXT: pxor %xmm4, %xmm6 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 @@ -1019,42 +1005,42 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm5, %xmm6 ; 
X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; X86-SSE2-NEXT: por %xmm6, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm5 -; X86-SSE2-NEXT: por %xmm0, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm3, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm5 +; X86-SSE2-NEXT: por %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, 
%xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm3, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: pandn %xmm5, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm5 -; X86-SSE2-NEXT: pandn %xmm2, %xmm0 -; X86-SSE2-NEXT: por %xmm5, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 @@ -1191,11 +1177,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-SSE42-NEXT: movapd %xmm2, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovgq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v8i64: @@ -1208,10 +1193,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, 
%xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovgq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1222,10 +1207,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovgq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -1253,7 +1238,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ret i64 %10 } -define i32 @test_reduce_v16i32(<16 x i32> %a0) { +define i32 @test_reduce_v16i32(<16 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 @@ -1423,7 +1408,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ret i32 %13 } -define i16 @test_reduce_v32i16(<32 x i16> %a0) { +define i16 @test_reduce_v32i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pmaxsw %xmm3, %xmm1 @@ -1568,7 +1553,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ret i16 %16 } -define i8 @test_reduce_v64i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8(<64 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v64i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 @@ -1598,21 +1583,18 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, 
%xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -1699,14 +1681,11 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -1798,7 +1777,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; Partial Vector Reductions ; -define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { +define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1885,7 +1864,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ret i16 %10 } -define i16 
@test_reduce_v32i16_v8i16(<32 x i16> %a0) { +define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1972,7 +1951,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { +define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1987,21 +1966,18 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2050,14 +2026,11 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw 
$8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -2112,7 +2085,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ret i8 %13 } -define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -2127,21 +2100,18 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl 
; @@ -2190,14 +2160,11 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll index 5e93f93a6d599..197f3ecf2290d 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -13,13 +13,13 @@ ; 128-bit Vectors ; -define i64 @test_reduce_v2i64(<2 x i64> %a0) { +define i64 @test_reduce_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v2i64: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 @@ -68,20 +68,18 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v2i64: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE42-NEXT: 
movq %xmm0, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovlq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1OR2-LABEL: test_reduce_v2i64: ; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1OR2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX1OR2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1OR2-NEXT: vmovq %xmm0, %rax +; X64-AVX1OR2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1OR2-NEXT: vmovq %xmm0, %rcx +; X64-AVX1OR2-NEXT: cmpq %rax, %rcx +; X64-AVX1OR2-NEXT: cmovlq %rcx, %rax ; X64-AVX1OR2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v2i64: @@ -97,7 +95,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ret i64 %4 } -define i32 @test_reduce_v4i32(<4 x i32> %a0) { +define i32 @test_reduce_v4i32(<4 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -173,7 +171,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ret i32 %7 } -define i16 @test_reduce_v8i16(<8 x i16> %a0) { +define i16 @test_reduce_v8i16(<8 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -257,7 +255,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v16i8(<16 x i8> %a0) { +define i8 @test_reduce_v16i8(<16 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -272,21 +270,18 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, 
%xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -334,14 +329,11 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -398,7 +390,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; 256-bit Vectors ; -define i64 @test_reduce_v4i64(<4 x i64> %a0) { +define i64 @test_reduce_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] @@ -417,9 +409,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; 
X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 @@ -507,11 +499,10 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovlq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v4i64: @@ -519,10 +510,10 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovlq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -531,10 +522,10 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; 
X64-AVX2-NEXT: cmovlq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -557,7 +548,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ret i64 %7 } -define i32 @test_reduce_v8i32(<8 x i32> %a0) { +define i32 @test_reduce_v8i32(<8 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -690,7 +681,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ret i32 %10 } -define i16 @test_reduce_v16i16(<16 x i16> %a0) { +define i16 @test_reduce_v16i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -814,7 +805,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ret i16 %13 } -define i8 @test_reduce_v32i8(<32 x i8> %a0) { +define i8 @test_reduce_v32i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -834,21 +825,18 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pandn %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; 
X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -919,14 +907,11 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm1, %xmm2 ; X64-SSE2-NEXT: pandn %xmm0, %xmm1 ; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X64-SSE2-NEXT: pand %xmm2, %xmm1 -; X64-SSE2-NEXT: pandn %xmm0, %xmm2 -; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -1007,13 +992,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; 512-bit Vectors ; -define i64 @test_reduce_v8i64(<8 x i64> %a0) { +define i64 @test_reduce_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 ; X86-SSE2-NEXT: pxor %xmm4, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 ; X86-SSE2-NEXT: pxor %xmm4, %xmm6 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 @@ -1023,42 +1008,42 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm5, %xmm6 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; X86-SSE2-NEXT: por %xmm6, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm1 -; X86-SSE2-NEXT: pandn %xmm3, %xmm5 -; X86-SSE2-NEXT: por %xmm1, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: pxor %xmm4, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 
-; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm3, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm5 +; X86-SSE2-NEXT: por %xmm0, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pandn %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} 
xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm3 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 @@ -1195,11 +1180,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-SSE42-NEXT: movapd %xmm3, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovlq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v8i64: @@ -1212,10 +1196,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovlq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1226,10 +1210,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; 
X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovlq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -1257,7 +1241,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ret i64 %10 } -define i32 @test_reduce_v16i32(<16 x i32> %a0) { +define i32 @test_reduce_v16i32(<16 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 @@ -1427,7 +1411,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ret i32 %13 } -define i16 @test_reduce_v32i16(<32 x i16> %a0) { +define i16 @test_reduce_v32i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pminsw %xmm3, %xmm1 @@ -1572,7 +1556,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ret i16 %16 } -define i8 @test_reduce_v64i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8(<64 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v64i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 @@ -1602,21 +1586,18 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand 
%xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -1703,14 +1684,11 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -1802,7 +1780,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; Partial Vector Reductions ; -define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { +define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1889,7 +1867,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ret i16 %10 } -define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { +define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1976,7 +1954,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { +define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X86-SSE2: ## %bb.0: ; 
X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1991,21 +1969,18 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2054,14 +2029,11 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; 
X64-SSE2-NEXT: retq ; @@ -2116,7 +2088,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ret i8 %13 } -define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -2131,21 +2103,18 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2194,14 +2163,11 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; 
X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll index aa2b6bacdd902..29fa565023c26 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -13,13 +13,13 @@ ; 128-bit Vectors ; -define i64 @test_reduce_v2i64(<2 x i64> %a0) { +define i64 @test_reduce_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v2i64: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -40,37 +40,37 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X86-SSE42-LABEL: test_reduce_v2i64: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: pxor %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm2, %xmm3 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X86-SSE42-NEXT: movd %xmm2, %eax -; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: pxor %xmm2, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-SSE42-NEXT: pxor %xmm3, %xmm2 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: 
movd %xmm3, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm3, %edx ; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: test_reduce_v2i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX1-NEXT: ## xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: ## xmm1 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: test_reduce_v2i64: ; X86-AVX2: ## %bb.0: -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: retl @@ -86,37 +86,26 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v2i64: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm2 = 
[9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: pxor %xmm2, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm3, %xmm2 -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE42-NEXT: movq %xmm3, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE42-NEXT: movq %xmm0, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovaq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v2i64: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; X64-AVX1-NEXT: ## xmm1 = mem[0,0] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovaq %rcx, %rax ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: test_reduce_v2i64: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovaq %rcx, %rax ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v2i64: @@ -132,7 +121,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ret i64 %4 } -define i32 @test_reduce_v4i32(<4 x i32> %a0) { +define i32 @test_reduce_v4i32(<4 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: ; 
X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] @@ -214,7 +203,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ret i32 %7 } -define i16 @test_reduce_v8i16(<8 x i16> %a0) { +define i16 @test_reduce_v8i16(<8 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -223,11 +212,10 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 ; X86-SSE2-NEXT: paddw %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X86-SSE2-NEXT: paddw %xmm0, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmoval %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -259,11 +247,10 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 ; X64-SSE2-NEXT: paddw %xmm1, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X64-SSE2-NEXT: paddw %xmm0, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmoval %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -318,7 +305,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v16i8(<16 x i8> %a0) { +define i8 @test_reduce_v16i8(<16 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -443,7 +430,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; 256-bit Vectors ; -define i64 
@test_reduce_v4i64(<4 x i64> %a0) { +define i64 @test_reduce_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] @@ -462,9 +449,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -491,9 +478,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE42-NEXT: movapd %xmm1, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pxor %xmm2, %xmm3 ; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -510,11 +497,11 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, 
%xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -528,11 +515,11 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -565,19 +552,16 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE42-LABEL: test_reduce_v4i64: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm4 -; X64-SSE42-NEXT: pxor %xmm3, %xmm4 -; X64-SSE42-NEXT: pxor %xmm3, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: movapd %xmm1, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm3, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm2, %xmm3 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE42-NEXT: pxor %xmm0, %xmm3 +; X64-SSE42-NEXT: pxor %xmm2, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovaq %rcx, %rax ; 
X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v4i64: @@ -586,15 +570,13 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX1-NEXT: ## xmm2 = mem[0,0] ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovaq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -603,15 +585,13 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, 
%rcx +; X64-AVX2-NEXT: cmovaq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -634,7 +614,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ret i64 %7 } -define i32 @test_reduce_v8i32(<8 x i32> %a0) { +define i32 @test_reduce_v8i32(<8 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] @@ -779,7 +759,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ret i32 %10 } -define i16 @test_reduce_v16i16(<16 x i16> %a0) { +define i16 @test_reduce_v16i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 @@ -790,11 +770,10 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X86-SSE2-NEXT: paddw %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X86-SSE2-NEXT: paddw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmoval %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -845,11 +824,10 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X64-SSE2-NEXT: paddw %xmm0, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrld $16, %xmm0 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X64-SSE2-NEXT: paddw %xmm1, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmoval %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -917,7 +895,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ret 
i16 %13 } -define i8 @test_reduce_v32i8(<32 x i8> %a0) { +define i8 @test_reduce_v32i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 @@ -1076,13 +1054,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; 512-bit Vectors ; -define i64 @test_reduce_v8i64(<8 x i64> %a0) { +define i64 @test_reduce_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 ; X86-SSE2-NEXT: pxor %xmm4, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 ; X86-SSE2-NEXT: pxor %xmm4, %xmm6 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 @@ -1092,42 +1070,42 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm5, %xmm6 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; X86-SSE2-NEXT: por %xmm6, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm5 -; X86-SSE2-NEXT: por %xmm0, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm3, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm5 +; X86-SSE2-NEXT: por %xmm1, %xmm5 +; X86-SSE2-NEXT: 
movdqa %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm3, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: pandn %xmm5, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm5 -; X86-SSE2-NEXT: pandn %xmm2, %xmm0 -; X86-SSE2-NEXT: por %xmm5, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 @@ -1149,26 +1127,27 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: movdqa 
%xmm0, %xmm4 ; X86-SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm6 ; X86-SSE42-NEXT: pxor %xmm5, %xmm6 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; X86-SSE42-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE42-NEXT: pxor %xmm5, %xmm4 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE42-NEXT: pxor %xmm5, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; X86-SSE42-NEXT: movapd %xmm3, %xmm1 ; X86-SSE42-NEXT: xorpd %xmm5, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE42-NEXT: pxor %xmm5, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; X86-SSE42-NEXT: movapd %xmm2, %xmm0 ; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X86-SSE42-NEXT: movapd %xmm3, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 ; X86-SSE42-NEXT: pxor %xmm1, %xmm5 ; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 @@ -1178,27 +1157,27 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; ; X86-AVX1-LABEL: test_reduce_v8i64: ; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] ; X86-AVX1-NEXT: ## xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm3 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm4 -; X86-AVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm0 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm5 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm3, %xmm4 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm5, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 +; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm3, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm5 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -1216,11 +1195,11 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, 
%xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -1298,17 +1277,14 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-SSE42-NEXT: pxor %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; X64-SSE42-NEXT: movapd %xmm2, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: xorpd %xmm2, %xmm5 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm5 +; X64-SSE42-NEXT: movdqa %xmm5, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: movapd %xmm3, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm1, %xmm5 -; X64-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovaq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v8i64: @@ -1329,12 +1305,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm3 -; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovaq %rcx, %rax ; 
X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1347,15 +1321,13 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovaq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -1383,7 +1355,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ret i64 %10 } -define i32 @test_reduce_v16i32(<16 x i32> %a0) { +define i32 @test_reduce_v16i32(<16 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] @@ -1577,7 +1549,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ret i32 %13 } -define i16 @test_reduce_v32i16(<32 x i16> %a0) { +define i16 @test_reduce_v32i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: psubusw %xmm0, %xmm2 @@ -1592,11 +1564,10 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X86-SSE2-NEXT: paddw %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 -; 
X86-SSE2-NEXT: paddw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmoval %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1657,11 +1628,10 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X64-SSE2-NEXT: paddw %xmm0, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrld $16, %xmm0 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X64-SSE2-NEXT: paddw %xmm1, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmoval %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -1740,7 +1710,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ret i16 %16 } -define i8 @test_reduce_v64i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8(<64 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v64i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pmaxub %xmm3, %xmm1 @@ -1920,7 +1890,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; Partial Vector Reductions ; -define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { +define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1929,11 +1899,10 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 ; X86-SSE2-NEXT: paddw %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X86-SSE2-NEXT: paddw %xmm0, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; 
X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmoval %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1966,11 +1935,10 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 ; X64-SSE2-NEXT: paddw %xmm1, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X64-SSE2-NEXT: paddw %xmm0, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmoval %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -2028,7 +1996,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ret i16 %10 } -define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { +define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -2037,11 +2005,10 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 ; X86-SSE2-NEXT: paddw %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X86-SSE2-NEXT: paddw %xmm0, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmoval %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -2074,11 +2041,10 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 ; X64-SSE2-NEXT: paddw %xmm1, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; 
X64-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X64-SSE2-NEXT: paddw %xmm0, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmoval %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -2136,7 +2102,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { +define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -2261,7 +2227,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ret i8 %13 } -define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll index a7ab20f246fb5..835b3c86a1a95 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -13,13 +13,13 @@ ; 128-bit Vectors ; -define i64 @test_reduce_v2i64(<2 x i64> %a0) { +define i64 @test_reduce_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v2i64: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 @@ -40,38 +40,38 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X86-SSE42-LABEL: test_reduce_v2i64: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; 
X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE42-NEXT: pxor %xmm0, %xmm3 -; X86-SSE42-NEXT: pxor %xmm2, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X86-SSE42-NEXT: movd %xmm2, %eax -; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE42-NEXT: pxor %xmm0, %xmm2 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movd %xmm3, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm3, %edx ; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: test_reduce_v2i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX1-NEXT: ## xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: ## xmm1 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: test_reduce_v2i64: ; X86-AVX2: ## %bb.0: -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, 
%xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: retl @@ -87,38 +87,26 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v2i64: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE42-NEXT: pxor %xmm0, %xmm2 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm3, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE42-NEXT: movq %xmm3, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE42-NEXT: movq %xmm0, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovbq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v2i64: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; X64-AVX1-NEXT: ## xmm1 = mem[0,0] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovbq %rcx, %rax ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: test_reduce_v2i64: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: 
vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovbq %rcx, %rax ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v2i64: @@ -134,7 +122,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ret i64 %4 } -define i32 @test_reduce_v4i32(<4 x i32> %a0) { +define i32 @test_reduce_v4i32(<4 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] @@ -216,7 +204,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ret i32 %7 } -define i16 @test_reduce_v8i16(<8 x i16> %a0) { +define i16 @test_reduce_v8i16(<8 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -227,12 +215,10 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmovbl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -260,12 +246,10 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: 
psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmovbl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -295,7 +279,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v16i8(<16 x i8> %a0) { +define i8 @test_reduce_v16i8(<16 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -385,7 +369,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; 256-bit Vectors ; -define i64 @test_reduce_v4i64(<4 x i64> %a0) { +define i64 @test_reduce_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] @@ -404,9 +388,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 @@ -434,9 +418,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE42-NEXT: pxor %xmm2, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE42-NEXT: movapd %xmm1, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm2, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pxor %xmm2, %xmm0 ; X86-SSE42-NEXT: pxor %xmm3, %xmm2 ; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm2 ; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 @@ -454,11 +438,11 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; 
X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -466,17 +450,17 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: -; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm4 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; 
X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -509,21 +493,16 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE42-LABEL: test_reduce_v4i64: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: movdqa %xmm0, %xmm4 -; X64-SSE42-NEXT: pxor %xmm3, %xmm4 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: pxor %xmm3, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE42-NEXT: pxor %xmm0, %xmm3 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: movapd %xmm1, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm3, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm2, %xmm3 -; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm3 -; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovbq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v4i64: @@ -532,15 +511,13 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX1-NEXT: ## xmm1 = mem[0,0] ; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 -; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, 
%xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovbq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -549,15 +526,13 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovbq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -580,7 +555,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ret i64 %7 } -define i32 @test_reduce_v8i32(<8 x i32> %a0) { +define i32 @test_reduce_v8i32(<8 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] @@ -725,7 +700,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ret i32 %10 } -define i16 @test_reduce_v16i16(<16 x i16> %a0) { +define i16 @test_reduce_v16i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 @@ -739,12 +714,10 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 
-; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmovbl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -789,12 +762,10 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmovbl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -851,7 +822,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ret i16 %13 } -define i8 @test_reduce_v32i8(<32 x i8> %a0) { +define i8 @test_reduce_v32i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pminub %xmm1, %xmm0 @@ -990,13 +961,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; 512-bit Vectors ; -define i64 @test_reduce_v8i64(<8 x i64> %a0) { +define i64 @test_reduce_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 ; X86-SSE2-NEXT: pxor %xmm4, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 ; X86-SSE2-NEXT: pxor %xmm4, %xmm6 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 @@ -1006,42 +977,42 @@ define i64 
@test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm5, %xmm6 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; X86-SSE2-NEXT: por %xmm6, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm1 -; X86-SSE2-NEXT: pandn %xmm3, %xmm5 -; X86-SSE2-NEXT: por %xmm1, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: pxor %xmm4, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm3, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm5 +; X86-SSE2-NEXT: por %xmm0, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} 
xmm0 = xmm3[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pandn %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm3 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 @@ -1061,32 +1032,32 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; ; X86-SSE42-LABEL: test_reduce_v8i64: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE42-NEXT: pxor %xmm4, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; X86-SSE42-NEXT: movapd %xmm2, %xmm5 +; X86-SSE42-NEXT: xorpd %xmm4, %xmm5 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE42-NEXT: pxor %xmm5, %xmm6 +; X86-SSE42-NEXT: pxor %xmm4, %xmm6 ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 +; X86-SSE42-NEXT: pxor %xmm4, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X86-SSE42-NEXT: movdqa %xmm4, %xmm1 -; X86-SSE42-NEXT: pxor %xmm5, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; 
X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; X86-SSE42-NEXT: movapd %xmm2, %xmm1 -; X86-SSE42-NEXT: xorpd %xmm5, %xmm1 ; X86-SSE42-NEXT: movapd %xmm3, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X86-SSE42-NEXT: movapd %xmm3, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 -; X86-SSE42-NEXT: pxor %xmm1, %xmm5 -; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm5 -; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm4 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; X86-SSE42-NEXT: movd %xmm1, %eax ; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx @@ -1094,27 +1065,27 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; ; X86-AVX1-LABEL: test_reduce_v8i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] ; X86-AVX1-NEXT: ## xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm3, %xmm4 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm5, %xmm6 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm5, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm5 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm5 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -1127,16 +1098,16 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm4 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, 
%xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -1198,34 +1169,30 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v8i64: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm5 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] ; X64-SSE42-NEXT: movdqa %xmm0, %xmm6 -; X64-SSE42-NEXT: pxor %xmm4, %xmm6 +; X64-SSE42-NEXT: pxor %xmm5, %xmm6 ; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: pxor %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; X64-SSE42-NEXT: movapd %xmm2, %xmm5 -; X64-SSE42-NEXT: xorpd %xmm4, %xmm5 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X64-SSE42-NEXT: movapd %xmm2, %xmm4 +; X64-SSE42-NEXT: xorpd %xmm5, %xmm4 ; X64-SSE42-NEXT: movdqa %xmm1, %xmm6 -; X64-SSE42-NEXT: pxor %xmm4, %xmm6 +; X64-SSE42-NEXT: pxor %xmm5, %xmm6 ; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: pxor %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE42-NEXT: movapd %xmm3, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm4, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 +; X64-SSE42-NEXT: xorpd %xmm3, %xmm5 +; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm5 +; X64-SSE42-NEXT: movdqa %xmm5, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: movapd %xmm3, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm4, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm1, %xmm4 -; X64-SSE42-NEXT: pcmpgtq 
%xmm0, %xmm4 -; X64-SSE42-NEXT: movdqa %xmm4, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovbq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v8i64: @@ -1246,12 +1213,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovbq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1264,15 +1229,13 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovbq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -1300,7 +1263,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ret i64 %10 } -define i32 
@test_reduce_v16i32(<16 x i32> %a0) { +define i32 @test_reduce_v16i32(<16 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] @@ -1494,7 +1457,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ret i32 %13 } -define i16 @test_reduce_v32i16(<32 x i16> %a0) { +define i16 @test_reduce_v32i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 @@ -1514,12 +1477,10 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmovbl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1576,12 +1537,10 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmovbl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -1649,7 +1608,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ret i16 %16 } -define i8 @test_reduce_v64i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8(<64 x i8> %a0) nounwind { ; 
X86-SSE2-LABEL: test_reduce_v64i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pminub %xmm3, %xmm1 @@ -1809,7 +1768,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; Partial Vector Reductions ; -define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { +define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1820,12 +1779,10 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmovbl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1854,12 +1811,10 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmovbl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -1890,7 +1845,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ret i16 %10 } -define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { +define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X86-SSE2: ## %bb.0: ; 
X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1901,12 +1856,10 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmovbl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1935,12 +1888,10 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmovbl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -1971,7 +1922,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { +define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -2059,7 +2010,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ret i8 %13 } -define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] diff --git 
a/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll index 8159468722596..65231c484db98 100644 --- a/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll +++ b/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll @@ -21,11 +21,10 @@ define i8 @ctz_v8i16(<8 x i16> %a) { ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: psubusw %xmm0, %xmm1 ; CHECK-NEXT: paddw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrld $16, %xmm0 -; CHECK-NEXT: psubusw %xmm1, %xmm0 -; CHECK-NEXT: paddw %xmm1, %xmm0 -; CHECK-NEXT: movd %xmm0, %ecx +; CHECK-NEXT: pextrw $1, %xmm1, %ecx +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: cmpw %cx, %ax +; CHECK-NEXT: cmoval %eax, %ecx ; CHECK-NEXT: movl $8, %eax ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax @@ -90,11 +89,10 @@ define i8 @ctz_v8i16_poison(<8 x i16> %a) { ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: psubusw %xmm0, %xmm1 ; CHECK-NEXT: paddw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrld $16, %xmm0 -; CHECK-NEXT: psubusw %xmm1, %xmm0 -; CHECK-NEXT: paddw %xmm1, %xmm0 -; CHECK-NEXT: movd %xmm0, %ecx +; CHECK-NEXT: pextrw $1, %xmm1, %ecx +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: cmpw %cx, %ax +; CHECK-NEXT: cmoval %eax, %ecx ; CHECK-NEXT: movl $8, %eax ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/vector-extract-last-active.ll b/llvm/test/CodeGen/X86/vector-extract-last-active.ll index 1ccd1d11fc7aa..19f54edd05ac1 100644 --- a/llvm/test/CodeGen/X86/vector-extract-last-active.ll +++ b/llvm/test/CodeGen/X86/vector-extract-last-active.ll @@ -164,11 +164,10 @@ define i32 @extract_last_active_v8i32(<8 x i32> %a, <8 x i1> %c) { ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: psubusw %xmm0, %xmm1 ; CHECK-NEXT: paddw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrld $16, %xmm0 -; CHECK-NEXT: psubusw 
%xmm1, %xmm0 -; CHECK-NEXT: paddw %xmm1, %xmm0 -; CHECK-NEXT: movd %xmm0, %ecx +; CHECK-NEXT: pextrw $1, %xmm1, %ecx +; CHECK-NEXT: movd %xmm1, %edx +; CHECK-NEXT: cmpw %cx, %dx +; CHECK-NEXT: cmoval %edx, %ecx ; CHECK-NEXT: andl $7, %ecx ; CHECK-NEXT: orl -40(%rsp,%rcx,4), %eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll index a302649decee8..a27756f71ca78 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -19,10 +19,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -52,10 +52,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-LABEL: test_v2i64: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] ; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE41-NEXT: pxor %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-SSE41-NEXT: pxor %xmm2, %xmm0 ; X86-SSE41-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -70,25 +70,13 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pextrd $1, %xmm2, %edx ; X86-SSE41-NEXT: retl ; -; X64-SSE41-LABEL: test_v2i64: -; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] -; X64-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE41-NEXT: pxor %xmm0, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: 
pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm2, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm5, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X64-SSE41-NEXT: por %xmm2, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE41-NEXT: movq %xmm3, %rax -; X64-SSE41-NEXT: retq +; X64-SSE4-LABEL: test_v2i64: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE4-NEXT: movq %xmm0, %rcx +; X64-SSE4-NEXT: cmpq %rax, %rcx +; X64-SSE4-NEXT: cmovgq %rcx, %rax +; X64-SSE4-NEXT: retq ; ; X86-SSE42-LABEL: test_v2i64: ; X86-SSE42: # %bb.0: @@ -100,15 +88,6 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx ; X86-SSE42-NEXT: retl ; -; X64-SSE42-LABEL: test_v2i64: -; X64-SSE42: # %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax -; X64-SSE42-NEXT: retq -; ; X86-AVX-LABEL: test_v2i64: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -120,10 +99,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; ; X64-AVX-LABEL: test_v2i64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, %rax +; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX-NEXT: vmovq %xmm0, %rcx +; X64-AVX-NEXT: cmpq %rax, %rcx +; X64-AVX-NEXT: cmovgq %rcx, %rax ; X64-AVX-NEXT: retq ; ; AVX512BW-LABEL: test_v2i64: @@ -164,9 +143,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd 
{{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -225,11 +204,11 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: por %xmm4, %xmm0 ; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE41-NEXT: movapd %xmm1, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm3, %xmm0 ; X86-SSE41-NEXT: pxor %xmm2, %xmm3 -; X86-SSE41-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE41-NEXT: movapd %xmm0, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 ; X86-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 @@ -245,34 +224,23 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE41-LABEL: test_v4i64: ; X64-SSE41: # %bb.0: ; X64-SSE41-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; X64-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE41-NEXT: pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm2, %xmm4 -; X64-SSE41-NEXT: pxor %xmm3, %xmm4 -; X64-SSE41-NEXT: movdqa %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE41-NEXT: movapd %xmm1, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm3, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm2, %xmm3 -; X64-SSE41-NEXT: movapd %xmm0, %xmm4 +; X64-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] +; 
X64-SSE41-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE41-NEXT: pxor %xmm0, %xmm3 +; X64-SSE41-NEXT: pxor %xmm2, %xmm0 +; X64-SSE41-NEXT: movdqa %xmm0, %xmm4 ; X64-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: pand %xmm5, %xmm3 ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; X64-SSE41-NEXT: por %xmm3, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE41-NEXT: movq %xmm2, %rax +; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE41-NEXT: movq %xmm1, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovgq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v4i64: @@ -293,11 +261,10 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovgq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v4i64: @@ -318,10 +285,10 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq 
$1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovgq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -343,10 +310,10 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovgq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -381,56 +348,56 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm4 ; X86-SSE2-NEXT: pxor %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pxor %xmm3, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: pandn %xmm5, %xmm4 +; X86-SSE2-NEXT: por %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd 
{{.*#+}} xmm4 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm7 -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm4 -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm5, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; 
X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pandn %xmm4, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm4 -; X86-SSE2-NEXT: pandn %xmm2, %xmm0 -; X86-SSE2-NEXT: por %xmm4, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 @@ -511,60 +478,60 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE41-NEXT: andl $-16, %esp ; X86-SSE41-NEXT: subl $16, %esp ; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648] +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm4 +; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = [2147483648,2147483648] +; X86-SSE41-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE41-NEXT: pxor %xmm5, %xmm6 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 ; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm5 -; X86-SSE41-NEXT: pxor %xmm4, %xmm5 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; X86-SSE41-NEXT: 
pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm7 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE41-NEXT: por %xmm7, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE41-NEXT: pxor %xmm4, %xmm3 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE41-NEXT: pxor %xmm5, %xmm1 +; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm3 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE41-NEXT: movapd %xmm5, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm2, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpgtd %xmm4, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; X86-SSE41-NEXT: 
movapd %xmm4, %xmm1 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; X86-SSE41-NEXT: movapd %xmm2, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm3 +; X86-SSE41-NEXT: pcmpgtd %xmm1, %xmm3 +; X86-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm1, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; X86-SSE41-NEXT: movapd %xmm4, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; X86-SSE41-NEXT: pxor %xmm1, %xmm5 +; X86-SSE41-NEXT: movapd %xmm0, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; X86-SSE41-NEXT: movd %xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE41-NEXT: movl %ebp, %esp @@ -603,31 +570,20 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X64-SSE41-NEXT: por %xmm6, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; X64-SSE41-NEXT: movapd %xmm2, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; X64-SSE41-NEXT: xorpd 
%xmm2, %xmm5 +; X64-SSE41-NEXT: movapd %xmm5, %xmm0 +; X64-SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm5 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm4, %xmm1 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm1, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE41-NEXT: movapd %xmm3, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm5 -; X64-SSE41-NEXT: movapd %xmm0, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm5, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm4 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm4, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm5, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE41-NEXT: movq %xmm3, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovgq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v8i64: @@ -669,11 +625,10 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-SSE42-NEXT: movapd %xmm2, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovgq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v8i64: @@ -704,10 +659,10 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq 
%xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovgq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -733,10 +688,10 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovgq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -774,124 +729,128 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $32, %esp -; X86-SSE2-NEXT: movaps %xmm2, (%esp) # 16-byte Spill -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: movaps %xmm0, (%esp) # 16-byte Spill +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm1 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; X86-SSE2-NEXT: movdqa %xmm6, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd 
{{.*#+}} xmm4 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm7 -; X86-SSE2-NEXT: pand %xmm4, %xmm1 -; X86-SSE2-NEXT: pandn %xmm6, %xmm4 -; X86-SSE2-NEXT: por %xmm1, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 -; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6 -; X86-SSE2-NEXT: pand %xmm1, %xmm5 -; X86-SSE2-NEXT: pandn %xmm7, %xmm1 -; X86-SSE2-NEXT: por %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: pxor %xmm3, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; X86-SSE2-NEXT: pand %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm4 +; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm6 +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm5, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm5[1,1,3,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] ; X86-SSE2-NEXT: pand %xmm5, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: por %xmm0, %xmm5 -; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm0 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pxor %xmm3, %xmm0 ; X86-SSE2-NEXT: pand %xmm5, %xmm2 ; X86-SSE2-NEXT: pandn %xmm6, %xmm5 ; X86-SSE2-NEXT: por %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: pandn %xmm4, %xmm1 +; X86-SSE2-NEXT: por %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm7 +; X86-SSE2-NEXT: pandn %xmm5, %xmm4 +; X86-SSE2-NEXT: por %xmm7, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa (%esp), %xmm7 # 16-byte Reload -; X86-SSE2-NEXT: movdqa %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa (%esp), %xmm5 # 
16-byte Reload +; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 ; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm6 -; X86-SSE2-NEXT: pand %xmm6, %xmm7 -; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm2 -; X86-SSE2-NEXT: pandn %xmm2, %xmm6 -; X86-SSE2-NEXT: por %xmm7, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: pand %xmm2, %xmm6 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm5 +; X86-SSE2-NEXT: pandn %xmm5, %xmm2 +; X86-SSE2-NEXT: por %xmm6, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pxor %xmm3, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pandn %xmm4, %xmm5 +; X86-SSE2-NEXT: por %xmm2, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; 
X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm5 -; X86-SSE2-NEXT: pandn %xmm6, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm5, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 -; X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm2, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm5, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm4 -; X86-SSE2-NEXT: pandn %xmm1, %xmm2 -; X86-SSE2-NEXT: por %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: por %xmm4, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: pxor %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pxor %xmm1, %xmm3 ; 
X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm3 @@ -900,9 +859,9 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm3, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %edx @@ -1029,32 +988,31 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pushl %ebp ; X86-SSE41-NEXT: movl %esp, %ebp ; X86-SSE41-NEXT: andl $-16, %esp -; X86-SSE41-NEXT: subl $48, %esp -; X86-SSE41-NEXT: movaps %xmm2, (%esp) # 16-byte Spill -; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm2 -; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648] -; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm6 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: subl $32, %esp +; X86-SSE41-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE41-NEXT: pxor %xmm4, %xmm2 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm7 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X86-SSE41-NEXT: por %xmm7, %xmm0 -; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm5 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; X86-SSE41-NEXT: movapd %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movaps %xmm0, (%esp) # 
16-byte Spill +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm7 +; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm4 +; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = [2147483648,2147483648] +; X86-SSE41-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm7, %xmm6 +; X86-SSE41-NEXT: pxor %xmm5, %xmm6 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; X86-SSE41-NEXT: pand %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm6 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 ; X86-SSE41-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 +; X86-SSE41-NEXT: pxor %xmm5, %xmm1 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 @@ -1063,54 +1021,40 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pand %xmm0, %xmm1 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm7 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE41-NEXT: movdqa %xmm7, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: movapd %xmm4, %xmm1 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 
+; X86-SSE41-NEXT: movapd %xmm6, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm1, %xmm7 ; X86-SSE41-NEXT: movdqa 56(%ebp), %xmm1 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE41-NEXT: por %xmm7, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa (%esp), %xmm6 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm6, %xmm2 -; X86-SSE41-NEXT: pxor %xmm4, %xmm2 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm7, %xmm2 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm2 -; X86-SSE41-NEXT: movapd %xmm2, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE41-NEXT: pxor %xmm5, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = 
xmm7[1,1,3,3] ; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X86-SSE41-NEXT: movapd %xmm5, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm6, %xmm2 -; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa (%esp), %xmm7 # 16-byte Reload +; X86-SSE41-NEXT: movdqa %xmm7, %xmm2 +; X86-SSE41-NEXT: pxor %xmm5, %xmm2 ; X86-SSE41-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 @@ -1119,33 +1063,46 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pand %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; X86-SSE41-NEXT: movapd %xmm5, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 ; X86-SSE41-NEXT: movapd %xmm1, %xmm2 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm2 -; X86-SSE41-NEXT: movapd %xmm2, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm2 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6 +; X86-SSE41-NEXT: movapd %xmm6, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm3 +; X86-SSE41-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm7 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; 
X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpgtd %xmm4, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; X86-SSE41-NEXT: por %xmm7, %xmm0 +; X86-SSE41-NEXT: movapd %xmm4, %xmm2 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; X86-SSE41-NEXT: movapd %xmm1, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm3 +; X86-SSE41-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; X86-SSE41-NEXT: movapd %xmm4, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; X86-SSE41-NEXT: pxor %xmm1, %xmm5 +; X86-SSE41-NEXT: movapd %xmm0, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; X86-SSE41-NEXT: movd %xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE41-NEXT: movl %ebp, %esp @@ -1236,31 +1193,20 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X64-SSE41-NEXT: por %xmm2, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 -; X64-SSE41-NEXT: movapd %xmm6, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm2 -; X64-SSE41-NEXT: 
pcmpgtd %xmm1, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X64-SSE41-NEXT: xorpd %xmm6, %xmm9 +; X64-SSE41-NEXT: movapd %xmm9, %xmm0 +; X64-SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm9 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm2, %xmm1 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm1, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE41-NEXT: movapd %xmm7, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm9 -; X64-SSE41-NEXT: movapd %xmm0, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm9, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm3 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE41-NEXT: movq %xmm7, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovgq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v16i64: @@ -1271,31 +1217,31 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE42-NEXT: subl $16, %esp ; X86-SSE42-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE42-NEXT: movdqa 72(%ebp), %xmm4 -; X86-SSE42-NEXT: movdqa 56(%ebp), %xmm5 -; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; X86-SSE42-NEXT: movdqa 40(%ebp), %xmm5 +; 
X86-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; X86-SSE42-NEXT: movapd %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; X86-SSE42-NEXT: movdqa 56(%ebp), %xmm1 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; X86-SSE42-NEXT: movdqa 24(%ebp), %xmm2 ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm3 -; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm4 -; X86-SSE42-NEXT: movdqa 40(%ebp), %xmm3 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X86-SSE42-NEXT: movapd %xmm3, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm4 ; X86-SSE42-NEXT: movapd %xmm2, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE42-NEXT: movapd %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE42-NEXT: movapd %xmm1, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm4 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 @@ -1330,11 +1276,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE42-NEXT: movapd %xmm6, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm7, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; 
X64-SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE42-NEXT: movq %xmm7, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovgq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v16i64: @@ -1343,25 +1288,25 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-AVX1-NEXT: movl %esp, %ebp ; X86-AVX1-NEXT: andl $-32, %esp ; X86-AVX1-NEXT: subl $32, %esp -; X86-AVX1-NEXT: vmovdqa 8(%ebp), %xmm3 -; X86-AVX1-NEXT: vmovdqa 24(%ebp), %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5 -; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm5 -; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm5 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm6 -; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1 -; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; X86-AVX1-NEXT: vmovdqa 8(%ebp), %xmm4 +; X86-AVX1-NEXT: vmovdqa 24(%ebp), %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm6 +; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm3, %xmm5, %xmm3 +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm7 +; X86-AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm6 +; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 +; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm4 ; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm5, %xmm3, %xmm1 -; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X86-AVX1-NEXT: 
vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1392,10 +1337,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovgq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1436,10 +1381,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovgq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -2798,21 +2743,18 @@ define i8 @test_v16i8(<16 x i8> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, 
%xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2837,14 +2779,11 @@ define i8 @test_v16i8(<16 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -2939,21 +2878,18 @@ define i8 @test_v32i8(<32 x i8> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pandn %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; 
X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2983,14 +2919,11 @@ define i8 @test_v32i8(<32 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm1, %xmm2 ; X64-SSE2-NEXT: pandn %xmm0, %xmm1 ; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X64-SSE2-NEXT: pand %xmm2, %xmm1 -; X64-SSE2-NEXT: pandn %xmm0, %xmm2 -; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -3142,21 +3075,18 @@ define i8 @test_v64i8(<64 x i8> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; 
X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -3198,14 +3128,11 @@ define i8 @test_v64i8(<64 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -3403,21 +3330,18 @@ define i8 @test_v128i8(<128 x i8> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pandn %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; 
X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -3479,14 +3403,11 @@ define i8 @test_v128i8(<128 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm1, %xmm2 ; X64-SSE2-NEXT: pandn %xmm0, %xmm1 ; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X64-SSE2-NEXT: pand %xmm2, %xmm1 -; X64-SSE2-NEXT: pandn %xmm0, %xmm2 -; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll index c010290c0d60d..2d2397d0f2454 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll @@ -19,10 +19,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: 
pcmpgtd %xmm3, %xmm4 @@ -52,10 +52,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-LABEL: test_v2i64: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] ; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE41-NEXT: pxor %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-SSE41-NEXT: pxor %xmm2, %xmm0 ; X86-SSE41-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 @@ -70,25 +70,13 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pextrd $1, %xmm2, %edx ; X86-SSE41-NEXT: retl ; -; X64-SSE41-LABEL: test_v2i64: -; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] -; X64-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE41-NEXT: pxor %xmm0, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm0, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm5, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X64-SSE41-NEXT: por %xmm2, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE41-NEXT: movq %xmm3, %rax -; X64-SSE41-NEXT: retq +; X64-SSE4-LABEL: test_v2i64: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE4-NEXT: movq %xmm0, %rcx +; X64-SSE4-NEXT: cmpq %rax, %rcx +; X64-SSE4-NEXT: cmovlq %rcx, %rax +; X64-SSE4-NEXT: retq ; ; X86-SSE42-LABEL: test_v2i64: ; X86-SSE42: # %bb.0: @@ -101,16 +89,6 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx ; X86-SSE42-NEXT: retl ; -; X64-SSE42-LABEL: test_v2i64: -; X64-SSE42: # %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd 
{{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax -; X64-SSE42-NEXT: retq -; ; X86-AVX-LABEL: test_v2i64: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -122,10 +100,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; ; X64-AVX-LABEL: test_v2i64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, %rax +; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX-NEXT: vmovq %xmm0, %rcx +; X64-AVX-NEXT: cmpq %rax, %rcx +; X64-AVX-NEXT: cmovlq %rcx, %rax ; X64-AVX-NEXT: retq ; ; AVX512BW-LABEL: test_v2i64: @@ -166,9 +144,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 @@ -226,9 +204,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: por %xmm4, %xmm0 ; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE41-NEXT: movapd %xmm1, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm3, %xmm0 ; X86-SSE41-NEXT: pxor %xmm2, %xmm3 ; X86-SSE41-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -246,33 +224,23 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE41-LABEL: test_v4i64: ; X64-SSE41: # %bb.0: ; X64-SSE41-NEXT: movdqa %xmm0, %xmm2 -; 
X64-SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; X64-SSE41-NEXT: pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm1, %xmm4 -; X64-SSE41-NEXT: pxor %xmm3, %xmm4 -; X64-SSE41-NEXT: movdqa %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE41-NEXT: movapd %xmm1, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm3, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm2, %xmm3 -; X64-SSE41-NEXT: movdqa %xmm3, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] +; X64-SSE41-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE41-NEXT: pxor %xmm0, %xmm3 +; X64-SSE41-NEXT: pxor %xmm1, %xmm0 +; X64-SSE41-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: pand %xmm5, %xmm3 ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; X64-SSE41-NEXT: por %xmm3, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE41-NEXT: movq %xmm2, %rax +; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE41-NEXT: movq %xmm1, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovlq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v4i64: @@ -295,11 +263,10 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; 
X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovlq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v4i64: @@ -320,10 +287,10 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovlq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -345,10 +312,10 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovlq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -383,56 +350,56 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; X86-SSE2-NEXT: movdqa 
%xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pxor %xmm3, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm6, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm1 -; X86-SSE2-NEXT: pandn %xmm5, %xmm4 -; X86-SSE2-NEXT: por %xmm1, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm4, %xmm7 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; 
X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pxor %xmm3, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm5, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pandn %xmm4, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm4 +; X86-SSE2-NEXT: pandn %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm4, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -513,60 +480,59 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE41-NEXT: andl $-16, %esp ; X86-SSE41-NEXT: subl $16, %esp ; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm4 -; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = [2147483648,2147483648] -; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm5, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm4, %xmm6 -; X86-SSE41-NEXT: pxor %xmm5, %xmm6 -; X86-SSE41-NEXT: movdqa 
%xmm6, %xmm7 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm6 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; X86-SSE41-NEXT: por %xmm6, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE41-NEXT: pxor %xmm5, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE41-NEXT: pxor %xmm5, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648] +; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE41-NEXT: pxor %xmm4, %xmm5 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm6 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm7 +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm7, %xmm0 ; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE41-NEXT: movapd %xmm2, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X86-SSE41-NEXT: movapd %xmm4, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm5, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; 
X86-SSE41-NEXT: movdqa %xmm4, %xmm0 -; X86-SSE41-NEXT: pxor %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm5 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm3 +; X86-SSE41-NEXT: pxor %xmm4, %xmm3 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: por %xmm3, %xmm0 +; X86-SSE41-NEXT: movapd %xmm2, %xmm3 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm3 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; X86-SSE41-NEXT: movapd %xmm5, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm3, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; X86-SSE41-NEXT: movapd %xmm5, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; X86-SSE41-NEXT: pxor %xmm1, %xmm4 +; X86-SSE41-NEXT: movdqa %xmm4, %xmm2 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; X86-SSE41-NEXT: movd 
%xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE41-NEXT: movl %ebp, %esp @@ -604,31 +570,20 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X64-SSE41-NEXT: por %xmm6, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE41-NEXT: movapd %xmm3, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm1 -; X64-SSE41-NEXT: pcmpgtd %xmm4, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm4, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE41-NEXT: xorpd %xmm3, %xmm5 +; X64-SSE41-NEXT: movapd %xmm5, %xmm0 +; X64-SSE41-NEXT: pcmpgtd %xmm4, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm1, %xmm4 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm4, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE41-NEXT: movapd %xmm3, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm5 -; X64-SSE41-NEXT: movdqa %xmm5, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm4 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm4, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm5, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE41-NEXT: movq %xmm3, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovlq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v8i64: @@ -670,11 +625,10 @@ define i64 @test_v8i64(<8 x i64> 
%a0) nounwind { ; X64-SSE42-NEXT: movapd %xmm3, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovlq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v8i64: @@ -705,10 +659,10 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovlq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -734,10 +688,10 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovlq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -774,127 +728,124 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp -; X86-SSE2-NEXT: subl $48, %esp -; 
X86-SSE2-NEXT: movaps %xmm1, (%esp) # 16-byte Spill +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm5 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm6 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm2 -; X86-SSE2-NEXT: pandn %xmm5, %xmm4 -; X86-SSE2-NEXT: por %xmm2, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm0, %xmm5 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm0 +; X86-SSE2-NEXT: pandn %xmm0, %xmm5 +; X86-SSE2-NEXT: por %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} 
xmm5 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm5, %xmm0 -; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm7 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm6, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pand %xmm7, %xmm2 +; X86-SSE2-NEXT: pandn %xmm6, %xmm7 +; X86-SSE2-NEXT: por %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm1 ; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm6 -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pandn %xmm5, %xmm0 -; X86-SSE2-NEXT: por %xmm7, %xmm0 -; X86-SSE2-NEXT: movdqa (%esp), %xmm4 # 16-byte Reload +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm7, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm5 -; 
X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm2 +; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm5, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm1, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm4 -; X86-SSE2-NEXT: pandn %xmm6, %xmm5 -; X86-SSE2-NEXT: por %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm6, %xmm2 +; X86-SSE2-NEXT: por %xmm4, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 ; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 -; X86-SSE2-NEXT: pxor %xmm3, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm6, %xmm1 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm4 ; X86-SSE2-NEXT: pand 
%xmm1, %xmm5 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: pandn %xmm7, %xmm1 ; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm4 # 16-byte Reload -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm2 -; X86-SSE2-NEXT: pandn %xmm4, %xmm5 -; X86-SSE2-NEXT: por %xmm2, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE2-NEXT: pxor %xmm3, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm5, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm4, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: 
pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm5 -; X86-SSE2-NEXT: pandn %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm5, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: pxor %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: pxor %xmm0, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm3 @@ -903,9 +854,9 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm3, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm2 -; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %edx @@ -1033,29 +984,26 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: movl %esp, %ebp ; X86-SSE41-NEXT: andl $-16, %esp ; X86-SSE41-NEXT: subl $48, %esp -; X86-SSE41-NEXT: movaps %xmm1, (%esp) # 16-byte Spill +; X86-SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: movdqa 56(%ebp), %xmm1 +; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm6 ; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648] -; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 -; 
X86-SSE41-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE41-NEXT: pxor %xmm4, %xmm6 -; X86-SSE41-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm5 +; X86-SSE41-NEXT: pxor %xmm4, %xmm5 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm6 +; X86-SSE41-NEXT: movdqa 56(%ebp), %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE41-NEXT: movapd %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-SSE41-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm1 ; X86-SSE41-NEXT: pxor %xmm4, %xmm1 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 @@ -1063,94 +1011,93 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm2 +; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm7 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 -; X86-SSE41-NEXT: movdqa %xmm7, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[1,1,3,3] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm3 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2 -; X86-SSE41-NEXT: movdqa (%esp), %xmm5 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm6, %xmm1 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; X86-SSE41-NEXT: movapd %xmm5, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm7 +; X86-SSE41-NEXT: pcmpgtd %xmm1, %xmm7 +; X86-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm1, %xmm2 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 +; X86-SSE41-NEXT: movapd %xmm5, (%esp) # 16-byte Spill +; X86-SSE41-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm7 +; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm5 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm6 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, 
%xmm5, %xmm3 -; X86-SSE41-NEXT: movapd %xmm3, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm2, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm7 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE41-NEXT: movapd %xmm6, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: movapd %xmm5, %xmm2 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; X86-SSE41-NEXT: movapd %xmm3, %xmm0 ; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm5, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm1 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm0 -; X86-SSE41-NEXT: movapd %xmm1, %xmm5 +; X86-SSE41-NEXT: movapd %xmm0, %xmm6 +; X86-SSE41-NEXT: pcmpgtd 
%xmm2, %xmm6 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm7 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: por %xmm7, %xmm0 +; X86-SSE41-NEXT: movdqa (%esp), %xmm7 # 16-byte Reload +; X86-SSE41-NEXT: movdqa %xmm7, %xmm2 +; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 +; X86-SSE41-NEXT: movapd %xmm3, %xmm0 ; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm2, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpgtd %xmm2, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm3 +; X86-SSE41-NEXT: movapd %xmm3, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; X86-SSE41-NEXT: pxor %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm4, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE41-NEXT: movdqa %xmm4, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; X86-SSE41-NEXT: pmovsxdq %xmm3, %xmm0 +; 
X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm4 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm4, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; X86-SSE41-NEXT: movd %xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE41-NEXT: movl %ebp, %esp @@ -1240,31 +1187,20 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X64-SSE41-NEXT: por %xmm1, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 -; X64-SSE41-NEXT: movapd %xmm7, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm1 -; X64-SSE41-NEXT: pcmpgtd %xmm2, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE41-NEXT: xorpd %xmm7, %xmm9 +; X64-SSE41-NEXT: movapd %xmm9, %xmm0 +; X64-SSE41-NEXT: pcmpgtd %xmm2, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm2, %xmm9 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm1, %xmm2 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm2, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE41-NEXT: movapd %xmm7, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm9 -; X64-SSE41-NEXT: movdqa %xmm9, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm3 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} 
xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE41-NEXT: movq %xmm7, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovlq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v16i64: @@ -1274,26 +1210,26 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE42-NEXT: andl $-16, %esp ; X86-SSE42-NEXT: subl $16, %esp ; X86-SSE42-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE42-NEXT: movdqa 40(%ebp), %xmm4 +; X86-SSE42-NEXT: movdqa 24(%ebp), %xmm4 ; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; X86-SSE42-NEXT: movdqa 72(%ebp), %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; X86-SSE42-NEXT: movdqa 24(%ebp), %xmm5 -; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm4 ; X86-SSE42-NEXT: movdqa 56(%ebp), %xmm3 ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm2 ; X86-SSE42-NEXT: movapd %xmm3, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm3 +; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; X86-SSE42-NEXT: movdqa 40(%ebp), %xmm4 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; X86-SSE42-NEXT: movdqa 72(%ebp), %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; X86-SSE42-NEXT: movapd %xmm1, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 
; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm1 @@ -1334,11 +1270,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE42-NEXT: movapd %xmm7, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE42-NEXT: movq %xmm7, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovlq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v16i64: @@ -1347,25 +1282,25 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-AVX1-NEXT: movl %esp, %ebp ; X86-AVX1-NEXT: andl $-32, %esp ; X86-AVX1-NEXT: subl $32, %esp -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; X86-AVX1-NEXT: vmovdqa 8(%ebp), %xmm5 -; X86-AVX1-NEXT: vmovdqa 24(%ebp), %xmm6 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 -; X86-AVX1-NEXT: vblendvpd %xmm7, %xmm4, %xmm6, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm6 -; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 +; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm3 +; X86-AVX1-NEXT: vmovdqa 8(%ebp), %xmm4 +; X86-AVX1-NEXT: vmovdqa 24(%ebp), %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6 +; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm6 +; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 +; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vextractf128 $1, 
%ymm1, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm5, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm4, %xmm1 -; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 ; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1396,10 +1331,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm0, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovlq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1440,10 +1375,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovlq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -2802,21 +2737,18 @@ define i8 @test_v16i8(<16 x i8> %a0) nounwind { ; 
X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2841,14 +2773,11 @@ define i8 @test_v16i8(<16 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -2943,21 +2872,18 @@ define i8 @test_v32i8(<32 x i8> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; 
X86-SSE2-NEXT: pandn %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2987,14 +2913,11 @@ define i8 @test_v32i8(<32 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm1, %xmm2 ; X64-SSE2-NEXT: pandn %xmm0, %xmm1 ; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X64-SSE2-NEXT: pand %xmm2, %xmm1 -; X64-SSE2-NEXT: pandn %xmm0, %xmm2 -; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -3146,21 +3069,18 @@ define i8 @test_v64i8(<64 x i8> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; 
X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -3202,14 +3122,11 @@ define i8 @test_v64i8(<64 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -3407,21 +3324,18 @@ define i8 @test_v128i8(<128 x i8> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; 
X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -3483,14 +3397,11 @@ define i8 @test_v128i8(<128 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll index 145c27e5eb976..0e78b804d9b8f 100644 --- 
a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -19,10 +19,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -52,10 +52,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-LABEL: test_v2i64: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE41-NEXT: pxor %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-SSE41-NEXT: pxor %xmm2, %xmm0 ; X86-SSE41-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -70,97 +70,66 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pextrd $1, %xmm2, %edx ; X86-SSE41-NEXT: retl ; -; X64-SSE41-LABEL: test_v2i64: -; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] -; X64-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE41-NEXT: pxor %xmm0, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm2, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm5, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X64-SSE41-NEXT: por %xmm2, %xmm0 -; X64-SSE41-NEXT: blendvpd 
%xmm0, %xmm1, %xmm3 -; X64-SSE41-NEXT: movq %xmm3, %rax -; X64-SSE41-NEXT: retq +; X64-SSE4-LABEL: test_v2i64: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE4-NEXT: movq %xmm0, %rcx +; X64-SSE4-NEXT: cmpq %rax, %rcx +; X64-SSE4-NEXT: cmovaq %rcx, %rax +; X64-SSE4-NEXT: retq ; ; X86-SSE42-LABEL: test_v2i64: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: pxor %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm2, %xmm3 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X86-SSE42-NEXT: movd %xmm2, %eax -; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: pxor %xmm2, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-SSE42-NEXT: pxor %xmm3, %xmm2 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movd %xmm3, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm3, %edx ; X86-SSE42-NEXT: retl ; -; X64-SSE42-LABEL: test_v2i64: -; X64-SSE42: # %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: pxor %xmm2, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm3, %xmm2 -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE42-NEXT: movq %xmm3, %rax -; X64-SSE42-NEXT: retq -; ; X86-AVX1-LABEL: test_v2i64: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX1-NEXT: # xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, 
%xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: # xmm1 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: retl ; ; X64-AVX1-LABEL: test_v2i64: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; X64-AVX1-NEXT: # xmm1 = mem[0,0] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovaq %rcx, %rax ; X64-AVX1-NEXT: retq ; ; X86-AVX2-LABEL: test_v2i64: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: test_v2i64: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = 
[9223372036854775808,9223372036854775808] -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovaq %rcx, %rax ; X64-AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v2i64: @@ -201,9 +170,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -262,11 +231,11 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: por %xmm4, %xmm0 ; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE41-NEXT: movapd %xmm1, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm3, %xmm0 ; X86-SSE41-NEXT: pxor %xmm2, %xmm3 -; X86-SSE41-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE41-NEXT: movapd %xmm0, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 ; X86-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 @@ -282,34 +251,23 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE41-LABEL: test_v4i64: ; X64-SSE41: # %bb.0: ; X64-SSE41-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; X64-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE41-NEXT: pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm2, 
%xmm4 -; X64-SSE41-NEXT: pxor %xmm3, %xmm4 -; X64-SSE41-NEXT: movdqa %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE41-NEXT: movapd %xmm1, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm3, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm2, %xmm3 -; X64-SSE41-NEXT: movapd %xmm0, %xmm4 +; X64-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; X64-SSE41-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE41-NEXT: pxor %xmm0, %xmm3 +; X64-SSE41-NEXT: pxor %xmm2, %xmm0 +; X64-SSE41-NEXT: movdqa %xmm0, %xmm4 ; X64-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: pand %xmm5, %xmm3 ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; X64-SSE41-NEXT: por %xmm3, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE41-NEXT: movq %xmm2, %rax +; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE41-NEXT: movq %xmm1, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovaq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v4i64: @@ -321,9 +279,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE42-NEXT: movapd %xmm1, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = 
xmm1[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pxor %xmm2, %xmm3 ; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -334,19 +292,16 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE42-LABEL: test_v4i64: ; X64-SSE42: # %bb.0: ; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm4 -; X64-SSE42-NEXT: pxor %xmm3, %xmm4 -; X64-SSE42-NEXT: pxor %xmm3, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: movapd %xmm1, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm3, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm2, %xmm3 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE42-NEXT: pxor %xmm0, %xmm3 +; X64-SSE42-NEXT: pxor %xmm2, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovaq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v4i64: @@ -358,11 +313,11 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm2, 
%xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -374,15 +329,13 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX1-NEXT: # xmm2 = mem[0,0] ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovaq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -394,11 +347,11 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd 
$1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -409,15 +362,13 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovaq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -452,56 +403,56 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm4 ; X86-SSE2-NEXT: pxor %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pxor %xmm3, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: pandn %xmm5, %xmm4 +; X86-SSE2-NEXT: por 
%xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm7 -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm4 -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm5, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; 
X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pandn %xmm4, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm4 -; X86-SSE2-NEXT: pandn %xmm2, %xmm0 -; X86-SSE2-NEXT: por %xmm4, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 @@ -582,60 +533,60 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE41-NEXT: andl $-16, %esp ; X86-SSE41-NEXT: subl $16, %esp ; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm4 +; X86-SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE41-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE41-NEXT: pxor %xmm5, %xmm6 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, 
%xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 ; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm5 -; X86-SSE41-NEXT: pxor %xmm4, %xmm5 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm7 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE41-NEXT: por %xmm7, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE41-NEXT: pxor %xmm4, %xmm3 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE41-NEXT: pxor %xmm5, %xmm1 +; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm3 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE41-NEXT: movapd %xmm5, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm2, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; 
X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpgtd %xmm4, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; X86-SSE41-NEXT: movapd %xmm4, %xmm1 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; X86-SSE41-NEXT: movapd %xmm2, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm3 +; X86-SSE41-NEXT: pcmpgtd %xmm1, %xmm3 +; X86-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm1, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; X86-SSE41-NEXT: movapd %xmm4, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; X86-SSE41-NEXT: pxor %xmm1, %xmm5 +; X86-SSE41-NEXT: movapd %xmm0, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; X86-SSE41-NEXT: movd %xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE41-NEXT: movl %ebp, %esp @@ -674,31 +625,20 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X64-SSE41-NEXT: por %xmm6, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; X64-SSE41-NEXT: movapd %xmm2, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, 
%xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; X64-SSE41-NEXT: xorpd %xmm2, %xmm5 +; X64-SSE41-NEXT: movapd %xmm5, %xmm0 +; X64-SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm5 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm4, %xmm1 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm1, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE41-NEXT: movapd %xmm3, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm5 -; X64-SSE41-NEXT: movapd %xmm0, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm5, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm4 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm4, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm5, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE41-NEXT: movq %xmm3, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovaq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v8i64: @@ -710,26 +650,27 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE42-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm4 ; X86-SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm6 ; X86-SSE42-NEXT: pxor %xmm5, %xmm6 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 -; 
X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE42-NEXT: movdqa %xmm4, %xmm3 -; X86-SSE42-NEXT: pxor %xmm5, %xmm3 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE42-NEXT: pxor %xmm5, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm4 ; X86-SSE42-NEXT: movapd %xmm4, %xmm1 ; X86-SSE42-NEXT: xorpd %xmm5, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE42-NEXT: pxor %xmm5, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm2 ; X86-SSE42-NEXT: movapd %xmm2, %xmm0 ; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; X86-SSE42-NEXT: movapd %xmm4, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 ; X86-SSE42-NEXT: pxor %xmm1, %xmm5 ; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm1 @@ -757,42 +698,39 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-SSE42-NEXT: pxor %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; X64-SSE42-NEXT: movapd %xmm2, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: xorpd %xmm2, %xmm5 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm5 +; X64-SSE42-NEXT: movdqa %xmm5, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: movapd %xmm3, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm1, %xmm5 -; X64-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq 
%rax, %rcx +; X64-SSE42-NEXT: cmovaq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v8i64: ; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] ; X86-AVX1-NEXT: # xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm3 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm4 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm5 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm3, %xmm4 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm5, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 +; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm3, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm5 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd 
%xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -816,12 +754,10 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm3 -; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovaq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -837,11 +773,11 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -856,15 +792,13 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, 
%xmm1, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovaq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -902,124 +836,128 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $32, %esp -; X86-SSE2-NEXT: movaps %xmm2, (%esp) # 16-byte Spill -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: movaps %xmm0, (%esp) # 16-byte Spill +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm1 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm6, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm7 -; X86-SSE2-NEXT: pand %xmm4, %xmm1 -; X86-SSE2-NEXT: pandn %xmm6, %xmm4 -; X86-SSE2-NEXT: por %xmm1, %xmm4 -; X86-SSE2-NEXT: movdqa 
%xmm7, %xmm0 -; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6 -; X86-SSE2-NEXT: pand %xmm1, %xmm5 -; X86-SSE2-NEXT: pandn %xmm7, %xmm1 -; X86-SSE2-NEXT: por %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: pxor %xmm3, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; X86-SSE2-NEXT: pand %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm4 +; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm6 +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm5, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] ; X86-SSE2-NEXT: pand %xmm5, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: por %xmm0, %xmm5 -; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm0 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pxor %xmm3, %xmm0 ; 
X86-SSE2-NEXT: pand %xmm5, %xmm2 ; X86-SSE2-NEXT: pandn %xmm6, %xmm5 ; X86-SSE2-NEXT: por %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: pandn %xmm4, %xmm1 +; X86-SSE2-NEXT: por %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm7 +; X86-SSE2-NEXT: pandn %xmm5, %xmm4 +; X86-SSE2-NEXT: por %xmm7, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa (%esp), %xmm7 # 16-byte Reload -; X86-SSE2-NEXT: movdqa %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa (%esp), %xmm5 # 16-byte Reload +; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 ; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por 
%xmm0, %xmm6 -; X86-SSE2-NEXT: pand %xmm6, %xmm7 -; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm2 -; X86-SSE2-NEXT: pandn %xmm2, %xmm6 -; X86-SSE2-NEXT: por %xmm7, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pxor %xmm3, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: pand %xmm2, %xmm6 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm5 +; X86-SSE2-NEXT: pandn %xmm5, %xmm2 +; X86-SSE2-NEXT: por %xmm6, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pandn %xmm4, %xmm5 +; X86-SSE2-NEXT: por %xmm2, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm5 -; X86-SSE2-NEXT: pandn %xmm6, %xmm0 +; 
X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm5, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 -; X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm2, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm5, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm4 -; X86-SSE2-NEXT: pandn %xmm1, %xmm2 -; X86-SSE2-NEXT: por %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: por %xmm4, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: pxor %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pxor %xmm1, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm3 @@ -1028,9 +966,9 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm3, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 +; 
X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %edx @@ -1157,32 +1095,31 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pushl %ebp ; X86-SSE41-NEXT: movl %esp, %ebp ; X86-SSE41-NEXT: andl $-16, %esp -; X86-SSE41-NEXT: subl $48, %esp -; X86-SSE41-NEXT: movaps %xmm2, (%esp) # 16-byte Spill -; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm2 -; X86-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm6 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: subl $32, %esp +; X86-SSE41-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE41-NEXT: pxor %xmm4, %xmm2 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm7 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X86-SSE41-NEXT: por %xmm7, %xmm0 -; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm5 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; X86-SSE41-NEXT: movapd %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movaps %xmm0, (%esp) # 16-byte Spill +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm7 +; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm4 +; X86-SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE41-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm7, %xmm6 +; X86-SSE41-NEXT: pxor %xmm5, %xmm6 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE41-NEXT: 
pcmpgtd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; X86-SSE41-NEXT: pand %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm6 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 ; X86-SSE41-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 +; X86-SSE41-NEXT: pxor %xmm5, %xmm1 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 @@ -1191,54 +1128,40 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pand %xmm0, %xmm1 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm7 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE41-NEXT: movdqa %xmm7, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: movapd %xmm4, %xmm1 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 +; X86-SSE41-NEXT: movapd %xmm6, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm1, %xmm7 ; X86-SSE41-NEXT: movdqa 
56(%ebp), %xmm1 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE41-NEXT: por %xmm7, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa (%esp), %xmm6 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm6, %xmm2 -; X86-SSE41-NEXT: pxor %xmm4, %xmm2 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm7, %xmm2 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm2 -; X86-SSE41-NEXT: movapd %xmm2, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE41-NEXT: pxor %xmm5, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X86-SSE41-NEXT: movapd %xmm5, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm6, %xmm2 -; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 
+; X86-SSE41-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa (%esp), %xmm7 # 16-byte Reload +; X86-SSE41-NEXT: movdqa %xmm7, %xmm2 +; X86-SSE41-NEXT: pxor %xmm5, %xmm2 ; X86-SSE41-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 @@ -1247,33 +1170,46 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pand %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; X86-SSE41-NEXT: movapd %xmm5, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 ; X86-SSE41-NEXT: movapd %xmm1, %xmm2 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm2 -; X86-SSE41-NEXT: movapd %xmm2, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm2 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6 +; X86-SSE41-NEXT: movapd %xmm6, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm3 +; X86-SSE41-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm7 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpgtd %xmm4, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; X86-SSE41-NEXT: por %xmm7, %xmm0 +; X86-SSE41-NEXT: movapd %xmm4, %xmm2 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm2 +; X86-SSE41-NEXT: blendvpd 
%xmm0, %xmm6, %xmm1 +; X86-SSE41-NEXT: movapd %xmm1, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm3 +; X86-SSE41-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; X86-SSE41-NEXT: movapd %xmm4, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; X86-SSE41-NEXT: pxor %xmm1, %xmm5 +; X86-SSE41-NEXT: movapd %xmm0, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; X86-SSE41-NEXT: movd %xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE41-NEXT: movl %ebp, %esp @@ -1364,31 +1300,20 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X64-SSE41-NEXT: por %xmm2, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 -; X64-SSE41-NEXT: movapd %xmm6, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm1, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X64-SSE41-NEXT: xorpd %xmm6, %xmm9 +; X64-SSE41-NEXT: movapd %xmm9, %xmm0 +; X64-SSE41-NEXT: pcmpgtd 
%xmm1, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm9 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm2, %xmm1 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm1, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE41-NEXT: movapd %xmm7, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm9 -; X64-SSE41-NEXT: movapd %xmm0, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm9, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm3 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE41-NEXT: movq %xmm7, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovaq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v16i64: @@ -1399,60 +1324,59 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE42-NEXT: subl $16, %esp ; X86-SSE42-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm7 +; X86-SSE42-NEXT: movdqa 72(%ebp), %xmm4 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm4, %xmm6 +; X86-SSE42-NEXT: pxor %xmm5, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE42-NEXT: pxor %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X86-SSE42-NEXT: movdqa 40(%ebp), %xmm6 -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: movdqa %xmm6, %xmm5 -; X86-SSE42-NEXT: pxor %xmm4, %xmm5 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; X86-SSE42-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE42-NEXT: pxor %xmm5, %xmm7 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; 
X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 -; X86-SSE42-NEXT: movdqa 72(%ebp), %xmm5 +; X86-SSE42-NEXT: pxor %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; X86-SSE42-NEXT: movdqa %xmm5, %xmm1 -; X86-SSE42-NEXT: pxor %xmm4, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm7, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa 24(%ebp), %xmm7 -; X86-SSE42-NEXT: movapd 8(%ebp), %xmm1 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE42-NEXT: movdqa %xmm7, %xmm1 -; X86-SSE42-NEXT: pxor %xmm4, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: movapd %xmm4, %xmm1 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm1 +; X86-SSE42-NEXT: movapd %xmm6, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X86-SSE42-NEXT: movdqa 56(%ebp), %xmm1 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE42-NEXT: pxor %xmm4, %xmm3 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm4 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE42-NEXT: pxor %xmm5, %xmm6 ; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X86-SSE42-NEXT: movdqa 24(%ebp), %xmm6 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X86-SSE42-NEXT: movapd %xmm1, %xmm2 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm2 -; X86-SSE42-NEXT: movapd %xmm7, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm6, %xmm2 +; X86-SSE42-NEXT: pxor %xmm5, %xmm2 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X86-SSE42-NEXT: movapd %xmm5, %xmm2 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm2 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; 
X86-SSE42-NEXT: movapd %xmm1, %xmm2 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm2 ; X86-SSE42-NEXT: movapd %xmm6, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; X86-SSE42-NEXT: movapd %xmm5, %xmm2 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm2 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; X86-SSE42-NEXT: movapd %xmm4, %xmm2 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm2 ; X86-SSE42-NEXT: movapd %xmm1, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pxor %xmm1, %xmm4 -; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; X86-SSE42-NEXT: movapd %xmm4, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; X86-SSE42-NEXT: pxor %xmm1, %xmm5 +; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; X86-SSE42-NEXT: movd %xmm1, %eax ; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE42-NEXT: movl %ebp, %esp @@ -1501,17 +1425,14 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE42-NEXT: xorpd %xmm9, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm6 -; X64-SSE42-NEXT: movapd %xmm6, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: xorpd %xmm6, %xmm9 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm9 +; X64-SSE42-NEXT: movdqa %xmm9, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE42-NEXT: movapd %xmm7, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm1, %xmm9 
-; X64-SSE42-NEXT: pcmpgtq %xmm9, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE42-NEXT: movq %xmm7, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovaq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v16i64: @@ -1519,47 +1440,51 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-AVX1-NEXT: pushl %ebp ; X86-AVX1-NEXT: movl %esp, %ebp ; X86-AVX1-NEXT: andl $-32, %esp -; X86-AVX1-NEXT: subl $32, %esp -; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; X86-AVX1-NEXT: subl $96, %esp +; X86-AVX1-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill +; X86-AVX1-NEXT: vmovaps %ymm0, (%esp) # 32-byte Spill ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,2147483648,0,2147483648] ; X86-AVX1-NEXT: # xmm3 = mem[0,0] +; X86-AVX1-NEXT: vmovaps 24(%ebp), %xmm4 ; X86-AVX1-NEXT: vxorps %xmm3, %xmm4, %xmm5 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 ; X86-AVX1-NEXT: vxorps %xmm3, %xmm6, %xmm7 ; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 ; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm4, %xmm4 -; X86-AVX1-NEXT: vxorps 24(%ebp), %xmm3, %xmm6 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm7, %xmm5 -; X86-AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm5 -; X86-AVX1-NEXT: vmovapd 24(%ebp), %xmm6 -; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm7, %xmm6, %xmm5 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm6 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm0, %xmm7 -; X86-AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6 -; X86-AVX1-NEXT: vmovaps 8(%ebp), %xmm7 -; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm2, %xmm0 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm7, %xmm2 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm6 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm7, %xmm1 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; X86-AVX1-NEXT: 
vxorps %xmm3, %xmm5, %xmm0 +; X86-AVX1-NEXT: vmovaps (%esp), %ymm2 # 32-byte Reload +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm7, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm6, %xmm0 +; X86-AVX1-NEXT: vblendvpd %xmm0, %xmm7, %xmm5, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm4, %xmm5 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm6 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm5, %xmm1 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm4, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm5, %xmm1 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm2 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; X86-AVX1-NEXT: vmovaps 8(%ebp), %xmm6 +; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm4 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm6, %xmm0 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm0 +; X86-AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm6, %xmm0 +; X86-AVX1-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm6 # 32-byte Reload +; X86-AVX1-NEXT: vxorps %xmm3, %xmm6, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm6, %xmm1 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm2 +; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm4, %xmm1 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; 
X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: movl %ebp, %esp @@ -1603,12 +1528,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm5, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm4, %xmm2, %xmm3 -; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovaq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1620,28 +1543,28 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-AVX2-NEXT: subl $32, %esp ; X86-AVX2-NEXT: vmovdqa 8(%ebp), %ymm4 ; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm3, %ymm4, %ymm5 +; X86-AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm6 +; X86-AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5 +; X86-AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 +; X86-AVX2-NEXT: vxorpd %ymm3, %ymm1, %ymm4 ; X86-AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm5 ; X86-AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm6 ; X86-AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5 ; X86-AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vpxor %ymm3, %ymm4, %ymm2 -; X86-AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm5 -; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm5, %ymm2 -; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm4, %ymm1 -; X86-AVX2-NEXT: vxorpd 
%ymm3, %ymm1, %ymm2 -; X86-AVX2-NEXT: vxorpd %ymm3, %ymm0, %ymm4 -; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm2 +; X86-AVX2-NEXT: vxorpd %ymm3, %ymm0, %ymm2 +; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm2 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm2 ; X86-AVX2-NEXT: vxorpd %xmm3, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm3, %xmm0, %xmm2 -; X86-AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm3 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm3, %xmm0, %xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm3, %xmm2, %xmm3 +; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: movl %ebp, %esp @@ -1669,12 +1592,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm4, %xmm2, %xmm3 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovaq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -2434,11 +2355,10 @@ define i16 @test_v8i16(<8 x i16> %a0) nounwind { ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE2-NEXT: psubusw %xmm1, %xmm0 ; SSE2-NEXT: paddw %xmm1, %xmm0 -; 
SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: psubusw %xmm0, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %eax +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: cmpw %ax, %cx +; SSE2-NEXT: cmoval %ecx, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: ret{{[l|q]}} ; @@ -2506,11 +2426,10 @@ define i16 @test_v16i16(<16 x i16> %a0) nounwind { ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: psubusw %xmm0, %xmm1 ; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: psubusw %xmm1, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $1, %xmm1, %eax +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: cmpw %ax, %cx +; SSE2-NEXT: cmoval %ecx, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: ret{{[l|q]}} ; @@ -2598,11 +2517,10 @@ define i16 @test_v32i16(<32 x i16> %a0) nounwind { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X86-SSE2-NEXT: paddw %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X86-SSE2-NEXT: paddw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmoval %ecx, %eax ; X86-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -2622,11 +2540,10 @@ define i16 @test_v32i16(<32 x i16> %a0) nounwind { ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X64-SSE2-NEXT: paddw %xmm0, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrld $16, %xmm0 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X64-SSE2-NEXT: paddw %xmm1, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, 
%xmm1, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmoval %ecx, %eax ; X64-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -2755,11 +2672,10 @@ define i16 @test_v64i16(<64 x i16> %a0) nounwind { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X86-SSE2-NEXT: paddw %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X86-SSE2-NEXT: paddw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmoval %ecx, %eax ; X86-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -2787,11 +2703,10 @@ define i16 @test_v64i16(<64 x i16> %a0) nounwind { ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X64-SSE2-NEXT: paddw %xmm0, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrld $16, %xmm0 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X64-SSE2-NEXT: paddw %xmm1, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmoval %ecx, %eax ; X64-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll index 84315e6c60895..bdf4a88aa3918 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -19,10 +19,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; 
X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 @@ -52,10 +52,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-LABEL: test_v2i64: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE41-NEXT: pxor %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-SSE41-NEXT: pxor %xmm2, %xmm0 ; X86-SSE41-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 @@ -70,99 +70,67 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pextrd $1, %xmm2, %edx ; X86-SSE41-NEXT: retl ; -; X64-SSE41-LABEL: test_v2i64: -; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] -; X64-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE41-NEXT: pxor %xmm0, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm0, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm5, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X64-SSE41-NEXT: por %xmm2, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE41-NEXT: movq %xmm3, %rax -; X64-SSE41-NEXT: retq +; X64-SSE4-LABEL: test_v2i64: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE4-NEXT: movq %xmm0, %rcx +; X64-SSE4-NEXT: cmpq %rax, %rcx +; X64-SSE4-NEXT: cmovbq %rcx, %rax +; X64-SSE4-NEXT: retq ; ; X86-SSE42-LABEL: test_v2i64: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; 
X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE42-NEXT: pxor %xmm0, %xmm3 -; X86-SSE42-NEXT: pxor %xmm2, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X86-SSE42-NEXT: movd %xmm2, %eax -; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE42-NEXT: pxor %xmm0, %xmm2 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movd %xmm3, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm3, %edx ; X86-SSE42-NEXT: retl ; -; X64-SSE42-LABEL: test_v2i64: -; X64-SSE42: # %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE42-NEXT: pxor %xmm0, %xmm2 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm3, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE42-NEXT: movq %xmm3, %rax -; X64-SSE42-NEXT: retq -; ; X86-AVX1-LABEL: test_v2i64: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX1-NEXT: # xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: # xmm1 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, 
%xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: retl ; ; X64-AVX1-LABEL: test_v2i64: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; X64-AVX1-NEXT: # xmm1 = mem[0,0] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovbq %rcx, %rax ; X64-AVX1-NEXT: retq ; ; X86-AVX2-LABEL: test_v2i64: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: test_v2i64: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, 
%rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovbq %rcx, %rax ; X64-AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v2i64: @@ -203,9 +171,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 @@ -263,9 +231,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: por %xmm4, %xmm0 ; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE41-NEXT: movapd %xmm1, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm3, %xmm0 ; X86-SSE41-NEXT: pxor %xmm2, %xmm3 ; X86-SSE41-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -283,33 +251,23 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE41-LABEL: test_v4i64: ; X64-SSE41: # %bb.0: ; X64-SSE41-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; X64-SSE41-NEXT: pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm1, %xmm4 -; X64-SSE41-NEXT: pxor %xmm3, %xmm4 -; X64-SSE41-NEXT: movdqa %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE41-NEXT: movapd %xmm1, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm3, 
%xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm2, %xmm3 -; X64-SSE41-NEXT: movdqa %xmm3, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; X64-SSE41-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE41-NEXT: pxor %xmm0, %xmm3 +; X64-SSE41-NEXT: pxor %xmm1, %xmm0 +; X64-SSE41-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: pand %xmm5, %xmm3 ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; X64-SSE41-NEXT: por %xmm3, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE41-NEXT: movq %xmm2, %rax +; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE41-NEXT: movq %xmm1, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovbq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v4i64: @@ -322,9 +280,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE42-NEXT: pxor %xmm2, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE42-NEXT: movapd %xmm1, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm2, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pxor %xmm2, %xmm0 ; X86-SSE42-NEXT: pxor %xmm3, %xmm2 ; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm2 ; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 @@ -336,21 +294,16 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE42-LABEL: test_v4i64: ; X64-SSE42: # %bb.0: ; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: movdqa %xmm0, 
%xmm4 -; X64-SSE42-NEXT: pxor %xmm3, %xmm4 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: pxor %xmm3, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE42-NEXT: pxor %xmm0, %xmm3 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: movapd %xmm1, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm3, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm2, %xmm3 -; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm3 -; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovbq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v4i64: @@ -362,11 +315,11 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -378,31 +331,29 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-AVX1-NEXT: # xmm1 = mem[0,0] ; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; 
X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 -; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovbq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; ; X86-AVX2-LABEL: test_v4i64: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm4 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -413,15 +364,13 @@ define i64 @test_v4i64(<4 x 
i64> %a0) nounwind { ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovbq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -456,56 +405,56 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pxor %xmm3, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm6, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm1 -; X86-SSE2-NEXT: pandn %xmm5, %xmm4 -; X86-SSE2-NEXT: por %xmm1, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; 
X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm4, %xmm7 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pxor %xmm3, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm5, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand 
%xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pandn %xmm4, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm4 +; X86-SSE2-NEXT: pandn %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm4, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -586,60 +535,59 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE41-NEXT: andl $-16, %esp ; X86-SSE41-NEXT: subl $16, %esp ; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm4 -; X86-SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm5, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm4, %xmm6 -; X86-SSE41-NEXT: pxor %xmm5, %xmm6 -; X86-SSE41-NEXT: movdqa %xmm6, %xmm7 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm6 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; X86-SSE41-NEXT: por %xmm6, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE41-NEXT: pxor %xmm5, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE41-NEXT: pxor %xmm5, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE41-NEXT: pxor %xmm4, %xmm5 +; X86-SSE41-NEXT: 
movdqa %xmm5, %xmm6 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm7 +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm7, %xmm0 ; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE41-NEXT: movapd %xmm2, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X86-SSE41-NEXT: movapd %xmm4, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm5, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm4, %xmm0 -; X86-SSE41-NEXT: pxor %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm5 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm3 +; X86-SSE41-NEXT: pxor %xmm4, %xmm3 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: por %xmm3, %xmm0 +; X86-SSE41-NEXT: movapd %xmm2, %xmm3 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm3 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, 
%xmm5 +; X86-SSE41-NEXT: movapd %xmm5, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm3, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; X86-SSE41-NEXT: movapd %xmm5, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; X86-SSE41-NEXT: pxor %xmm1, %xmm4 +; X86-SSE41-NEXT: movdqa %xmm4, %xmm2 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; X86-SSE41-NEXT: movd %xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE41-NEXT: movl %ebp, %esp @@ -677,31 +625,20 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X64-SSE41-NEXT: por %xmm6, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE41-NEXT: movapd %xmm3, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm1 -; X64-SSE41-NEXT: pcmpgtd %xmm4, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm4, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE41-NEXT: xorpd %xmm3, %xmm5 +; X64-SSE41-NEXT: movapd %xmm5, %xmm0 +; 
X64-SSE41-NEXT: pcmpgtd %xmm4, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm1, %xmm4 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm4, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE41-NEXT: movapd %xmm3, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm5 -; X64-SSE41-NEXT: movdqa %xmm5, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm4 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm4, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm5, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE41-NEXT: movq %xmm3, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovbq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v8i64: @@ -710,33 +647,33 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE42-NEXT: movl %esp, %ebp ; X86-SSE42-NEXT: andl $-16, %esp ; X86-SSE42-NEXT: subl $16, %esp -; X86-SSE42-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE42-NEXT: pxor %xmm3, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X86-SSE42-NEXT: movapd %xmm2, %xmm4 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm4 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE42-NEXT: pxor %xmm4, %xmm6 +; 
X86-SSE42-NEXT: pxor %xmm3, %xmm6 ; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE42-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE42-NEXT: pxor %xmm4, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE42-NEXT: movapd %xmm2, %xmm1 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm1 ; X86-SSE42-NEXT: movapd %xmm5, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; X86-SSE42-NEXT: movapd %xmm5, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pxor %xmm1, %xmm4 -; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 -; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm3 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm3 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; X86-SSE42-NEXT: movd %xmm1, %eax ; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx @@ -746,59 +683,55 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; ; X64-SSE42-LABEL: test_v8i64: ; X64-SSE42: # %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm5 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] ; X64-SSE42-NEXT: movdqa %xmm0, %xmm6 -; X64-SSE42-NEXT: pxor %xmm4, %xmm6 +; X64-SSE42-NEXT: pxor %xmm5, %xmm6 ; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: pxor %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 -; 
X64-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; X64-SSE42-NEXT: movapd %xmm2, %xmm5 -; X64-SSE42-NEXT: xorpd %xmm4, %xmm5 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X64-SSE42-NEXT: movapd %xmm2, %xmm4 +; X64-SSE42-NEXT: xorpd %xmm5, %xmm4 ; X64-SSE42-NEXT: movdqa %xmm1, %xmm6 -; X64-SSE42-NEXT: pxor %xmm4, %xmm6 +; X64-SSE42-NEXT: pxor %xmm5, %xmm6 ; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: pxor %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE42-NEXT: movapd %xmm3, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm4, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 +; X64-SSE42-NEXT: xorpd %xmm3, %xmm5 +; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm5 +; X64-SSE42-NEXT: movdqa %xmm5, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: movapd %xmm3, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm4, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm1, %xmm4 -; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 -; X64-SSE42-NEXT: movdqa %xmm4, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovbq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v8i64: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] ; X86-AVX1-NEXT: # xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm3, %xmm4 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm5, %xmm6 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm5, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm5 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: 
vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm5 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -822,12 +755,10 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovbq %rcx, %rax ; 
X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -838,16 +769,16 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm4 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -862,15 +793,13 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; 
X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovbq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -907,127 +836,124 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp -; X86-SSE2-NEXT: subl $48, %esp -; X86-SSE2-NEXT: movaps %xmm1, (%esp) # 16-byte Spill +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm5 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm6 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm2 -; X86-SSE2-NEXT: pandn %xmm5, %xmm4 -; X86-SSE2-NEXT: por %xmm2, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm0, %xmm5 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm0 +; X86-SSE2-NEXT: pandn %xmm0, %xmm5 +; X86-SSE2-NEXT: por 
%xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm5, %xmm0 -; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm7 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm6, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pand %xmm7, %xmm2 +; X86-SSE2-NEXT: pandn %xmm6, %xmm7 +; X86-SSE2-NEXT: por %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm1 ; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm6 -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pandn 
%xmm5, %xmm0 -; X86-SSE2-NEXT: por %xmm7, %xmm0 -; X86-SSE2-NEXT: movdqa (%esp), %xmm4 # 16-byte Reload +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm7, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm5 -; X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm2 +; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm5, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm1, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm4 -; X86-SSE2-NEXT: pandn %xmm6, %xmm5 -; X86-SSE2-NEXT: por %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm6, %xmm2 +; X86-SSE2-NEXT: por %xmm4, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 ; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 -; X86-SSE2-NEXT: pxor %xmm3, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm6, %xmm1 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm6 
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm4 ; X86-SSE2-NEXT: pand %xmm1, %xmm5 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: pandn %xmm7, %xmm1 ; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm4 # 16-byte Reload -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm2 -; X86-SSE2-NEXT: pandn %xmm4, %xmm5 -; X86-SSE2-NEXT: por %xmm2, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE2-NEXT: pxor %xmm3, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm5, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE2-NEXT: pcmpeqd 
%xmm5, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm4, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm5 -; X86-SSE2-NEXT: pandn %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm5, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: pxor %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: pxor %xmm0, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm3 @@ -1036,9 +962,9 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm3, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm2 -; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %edx @@ -1166,29 +1092,26 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: movl %esp, %ebp ; X86-SSE41-NEXT: andl $-16, %esp ; X86-SSE41-NEXT: subl $48, %esp -; X86-SSE41-NEXT: movaps %xmm1, 
(%esp) # 16-byte Spill +; X86-SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: movdqa 56(%ebp), %xmm1 +; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm6 ; X86-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE41-NEXT: pxor %xmm4, %xmm6 -; X86-SSE41-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm5 +; X86-SSE41-NEXT: pxor %xmm4, %xmm5 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm6 +; X86-SSE41-NEXT: movdqa 56(%ebp), %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE41-NEXT: movapd %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-SSE41-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm1 ; X86-SSE41-NEXT: pxor %xmm4, %xmm1 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 @@ -1196,94 +1119,93 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm2 +; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X86-SSE41-NEXT: 
por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm7 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 -; X86-SSE41-NEXT: movdqa %xmm7, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm3 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2 -; X86-SSE41-NEXT: movdqa (%esp), %xmm5 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm6, %xmm1 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; X86-SSE41-NEXT: movapd %xmm5, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm7 +; X86-SSE41-NEXT: pcmpgtd %xmm1, %xmm7 +; X86-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm1, %xmm2 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 +; X86-SSE41-NEXT: movapd %xmm5, (%esp) # 16-byte Spill +; X86-SSE41-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm7 +; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm5 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd 
{{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm6 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 -; X86-SSE41-NEXT: movapd %xmm3, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm2, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm7 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE41-NEXT: movapd %xmm6, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: movapd %xmm5, %xmm2 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; X86-SSE41-NEXT: movapd %xmm3, %xmm0 ; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm5, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = 
xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm1 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm0 -; X86-SSE41-NEXT: movapd %xmm1, %xmm5 +; X86-SSE41-NEXT: movapd %xmm0, %xmm6 +; X86-SSE41-NEXT: pcmpgtd %xmm2, %xmm6 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm7 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: por %xmm7, %xmm0 +; X86-SSE41-NEXT: movdqa (%esp), %xmm7 # 16-byte Reload +; X86-SSE41-NEXT: movdqa %xmm7, %xmm2 +; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 +; X86-SSE41-NEXT: movapd %xmm3, %xmm0 ; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm2, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpgtd %xmm2, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, 
%xmm3 +; X86-SSE41-NEXT: movapd %xmm3, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; X86-SSE41-NEXT: pxor %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm4, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE41-NEXT: movdqa %xmm4, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; X86-SSE41-NEXT: pmovsxdq %xmm3, %xmm0 +; X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm4 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm4, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; X86-SSE41-NEXT: movd %xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE41-NEXT: movl %ebp, %esp @@ -1373,31 +1295,20 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X64-SSE41-NEXT: por %xmm1, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 -; X64-SSE41-NEXT: movapd %xmm7, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm1 -; X64-SSE41-NEXT: pcmpgtd %xmm2, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE41-NEXT: xorpd %xmm7, %xmm9 +; X64-SSE41-NEXT: movapd %xmm9, %xmm0 +; X64-SSE41-NEXT: pcmpgtd %xmm2, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm2, %xmm9 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm1, %xmm2 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm2, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE41-NEXT: movapd %xmm7, %xmm0 -; 
X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm9 -; X64-SSE41-NEXT: movdqa %xmm9, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm3 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE41-NEXT: movq %xmm7, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovbq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v16i64: @@ -1405,65 +1316,63 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE42-NEXT: pushl %ebp ; X86-SSE42-NEXT: movl %esp, %ebp ; X86-SSE42-NEXT: andl $-16, %esp -; X86-SSE42-NEXT: subl $32, %esp -; X86-SSE42-NEXT: movaps %xmm1, (%esp) # 16-byte Spill -; X86-SSE42-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE42-NEXT: movdqa 72(%ebp), %xmm5 -; X86-SSE42-NEXT: movdqa 56(%ebp), %xmm6 -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: movdqa %xmm2, %xmm7 -; X86-SSE42-NEXT: pxor %xmm4, %xmm7 +; X86-SSE42-NEXT: subl $16, %esp +; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE42-NEXT: movdqa 24(%ebp), %xmm6 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE42-NEXT: pxor %xmm3, %xmm5 ; X86-SSE42-NEXT: movdqa %xmm6, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 -; X86-SSE42-NEXT: movdqa 24(%ebp), %xmm7 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm6 -; X86-SSE42-NEXT: movdqa %xmm3, %xmm2 -; X86-SSE42-NEXT: pxor %xmm4, %xmm2 -; X86-SSE42-NEXT: movdqa %xmm7, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm1 -; 
X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE42-NEXT: pxor %xmm4, %xmm2 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 +; X86-SSE42-NEXT: movdqa 56(%ebp), %xmm5 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE42-NEXT: pxor %xmm3, %xmm4 ; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X86-SSE42-NEXT: movdqa 72(%ebp), %xmm4 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; X86-SSE42-NEXT: movapd %xmm6, %xmm2 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm2 +; X86-SSE42-NEXT: movapd %xmm5, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X86-SSE42-NEXT: movdqa 40(%ebp), %xmm2 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE42-NEXT: movdqa (%esp), %xmm1 # 16-byte Reload -; X86-SSE42-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE42-NEXT: pxor %xmm4, %xmm3 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm5 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE42-NEXT: pxor %xmm3, %xmm6 ; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm6 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X86-SSE42-NEXT: movapd %xmm2, %xmm1 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE42-NEXT: movapd %xmm5, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE42-NEXT: pxor %xmm3, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE42-NEXT: movapd %xmm7, %xmm1 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE42-NEXT: movapd %xmm6, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd 
%xmm0, %xmm6, %xmm4 +; X86-SSE42-NEXT: movapd %xmm2, %xmm1 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm1 +; X86-SSE42-NEXT: movapd %xmm4, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm6 -; X86-SSE42-NEXT: movapd %xmm6, %xmm1 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE42-NEXT: movapd %xmm5, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; X86-SSE42-NEXT: movapd %xmm5, %xmm1 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm1 +; X86-SSE42-NEXT: movapd %xmm4, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pxor %xmm1, %xmm4 -; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 -; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; X86-SSE42-NEXT: movapd %xmm4, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; X86-SSE42-NEXT: pxor %xmm1, %xmm3 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm3 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; X86-SSE42-NEXT: movd %xmm1, %eax ; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE42-NEXT: movl %ebp, %esp @@ -1512,18 +1421,14 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE42-NEXT: xorpd %xmm8, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm7 -; X64-SSE42-NEXT: movapd %xmm7, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm8, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE42-NEXT: movapd %xmm7, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm8, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm1, %xmm8 -; 
X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm8 +; X64-SSE42-NEXT: xorpd %xmm7, %xmm8 +; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm8 ; X64-SSE42-NEXT: movdqa %xmm8, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; X64-SSE42-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE42-NEXT: movq %xmm7, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovbq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v16i64: @@ -1534,44 +1439,44 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-AVX1-NEXT: subl $32, %esp ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,2147483648,0,2147483648] ; X86-AVX1-NEXT: # xmm3 = mem[0,0] -; X86-AVX1-NEXT: vmovaps 8(%ebp), %xmm4 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm4, %xmm6 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm0, %xmm4 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm4 +; X86-AVX1-NEXT: vmovaps 8(%ebp), %xmm5 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm5, %xmm6 ; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm7 ; X86-AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm6 -; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm4 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm0, %xmm6 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm7 -; X86-AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6 -; X86-AVX1-NEXT: vxorps 24(%ebp), %xmm3, %xmm7 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm5 -; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 -; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm2, %xmm6 -; X86-AVX1-NEXT: vmovapd 24(%ebp), %xmm7 -; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm4, %xmm7 +; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm5, %xmm5 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm5, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm6 +; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm4 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X86-AVX1-NEXT: 
vxorps %xmm3, %xmm0, %xmm5 ; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm7 -; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; X86-AVX1-NEXT: vmovaps 24(%ebp), %xmm6 ; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm6, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm2 +; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm6, %xmm1 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm5 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm6, %xmm1 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm4, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm6, %xmm4, %xmm1 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm2 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm4, %xmm1 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: movl %ebp, %esp @@ -1615,12 +1520,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; 
X64-AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm5, %xmm0, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm4, %xmm2, %xmm3 -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovbq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1632,28 +1535,28 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-AVX2-NEXT: subl $32, %esp ; X86-AVX2-NEXT: vmovdqa 8(%ebp), %ymm4 ; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm5 +; X86-AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm6 +; X86-AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5 +; X86-AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vxorpd %ymm3, %ymm0, %ymm2 ; X86-AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm5 ; X86-AVX2-NEXT: vpxor %ymm3, %ymm4, %ymm6 ; X86-AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5 ; X86-AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 -; X86-AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm4 -; X86-AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm5 -; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; X86-AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vxorpd %ymm3, %ymm0, %ymm2 ; X86-AVX2-NEXT: vxorpd %ymm3, %ymm1, %ymm4 ; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm2 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vxorpd %xmm3, %xmm0, %xmm2 -; X86-AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd 
%xmm3, %xmm0, %xmm2 -; X86-AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm3 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm3, %xmm0, %xmm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX2-NEXT: vxorpd %xmm3, %xmm2, %xmm4 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm3, %xmm0, %xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm3, %xmm2, %xmm3 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: movl %ebp, %esp @@ -1681,12 +1584,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vxorpd %xmm4, %xmm2, %xmm3 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 ; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm4, %xmm2, %xmm3 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovbq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -2451,12 +2352,10 @@ define i16 @test_v8i16(<8 x i16> %a0) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psubusw %xmm1, %xmm2 ; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %eax +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: cmpw %ax, %cx +; SSE2-NEXT: cmovbl %ecx, %eax ; SSE2-NEXT: # kill: def $ax 
killed $ax killed $eax ; SSE2-NEXT: ret{{[l|q]}} ; @@ -2491,12 +2390,10 @@ define i16 @test_v16i16(<16 x i16> %a0) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psubusw %xmm1, %xmm2 ; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %eax +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: cmpw %ax, %cx +; SSE2-NEXT: cmovbl %ecx, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: ret{{[l|q]}} ; @@ -2565,12 +2462,10 @@ define i16 @test_v32i16(<32 x i16> %a0) nounwind { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmovbl %ecx, %eax ; X86-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -2595,12 +2490,10 @@ define i16 @test_v32i16(<32 x i16> %a0) nounwind { ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmovbl %ecx, %eax ; X64-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -2706,12 +2599,10 @@ define i16 @test_v64i16(<64 x i16> %a0) 
nounwind { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmovbl %ecx, %eax ; X86-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -2748,12 +2639,10 @@ define i16 @test_v64i16(<64 x i16> %a0) nounwind { ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmovbl %ecx, %eax ; X64-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; diff --git a/llvm/test/MC/AMDGPU/literals.s b/llvm/test/MC/AMDGPU/literals.s index 07b7639efafe5..a3a6f9b198db2 100644 --- a/llvm/test/MC/AMDGPU/literals.s +++ b/llvm/test/MC/AMDGPU/literals.s @@ -494,7 +494,7 @@ v_pk_add_u16 v5, exec_lo, lit(1.0) // GFX11: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x1a,0x00,0x00,0x80,0x3f] // GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x1a,0x00,0x00,0x80,0x3f] // NOCI: :[[@LINE-3]]:1: error: instruction not supported on this GPU (bonaire): v_pk_add_u16 -// NOGFX9: :[[@LINE-4]]:31: error: invalid operand (violates constant bus restrictions) +// NOGFX9: :[[@LINE-4]]:31: error: literal operands are not supported // 
NOSI: :[[@LINE-5]]:1: error: instruction not supported on this GPU (tahiti): v_pk_add_u16 // NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU (tonga): v_pk_add_u16 @@ -976,7 +976,7 @@ v_pk_add_u16 v5, exec_lo, lit(1) // GFX11: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x1a,0x01,0x00,0x00,0x00] // GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x1a,0x01,0x00,0x00,0x00] // NOCI: :[[@LINE-3]]:1: error: instruction not supported on this GPU (bonaire): v_pk_add_u16 -// NOGFX9: :[[@LINE-4]]:31: error: invalid operand (violates constant bus restrictions) +// NOGFX9: :[[@LINE-4]]:31: error: literal operands are not supported // NOSI: :[[@LINE-5]]:1: error: instruction not supported on this GPU (tahiti): v_pk_add_u16 // NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU (tonga): v_pk_add_u16 @@ -1982,14 +1982,8 @@ v_add_nc_u64 v[0:1], v[0:1], lit64(1) // NOVI: :[[@LINE-7]]:1: error: instruction not supported on this GPU (tonga): v_add_nc_u64 v_add_f64 v[0:1], v[0:1], lit(1) -// GFX11: v_add_f64 v[0:1], v[0:1], lit(0x1) ; encoding: [0x00,0x00,0x27,0xd7,0x00,0xff,0x01,0x02,0x01,0x00,0x00,0x00] -// GFX12: v_add_f64_e64 v[0:1], v[0:1], lit(0x1) ; encoding: [0x00,0x00,0x02,0xd5,0x00,0xff,0x01,0x02,0x01,0x00,0x00,0x00] -// GFX1250-ASM: v_add_f64_e64 v[0:1], v[0:1], lit(0x1) ; encoding: [0x00,0x00,0x02,0xd5,0x00,0xff,0x01,0x02,0x01,0x00,0x00,0x00] -// GFX1250-DIS: v_add_f64_e64 v[0:1], v[0:1], 0x1 ; encoding: [0x00,0x00,0x02,0xd5,0x00,0xff,0x01,0x02,0x01,0x00,0x00,0x00] -// GFX89: v_add_f64 v[0:1], v[0:1], lit(0x1) ; encoding: [0x00,0x00,0x80,0xd2,0x00,0xff,0x01,0x00] -// SICI: v_add_f64 v[0:1], v[0:1], lit(0x1) ; encoding: [0x00,0x00,0xc8,0xd2,0x00,0xff,0x01,0x00] +// NOGCN: :[[@LINE-1]]:31: error: invalid operand for instruction -// FIXME: Forced lit() encoding is not preserved after disasm v_add_f64 v[0:1], v[0:1], lit(1.0) // NOGCN: :[[@LINE-1]]:31: error: 
invalid operand for instruction diff --git a/llvm/test/MachineVerifier/AMDGPU/lit64.mir b/llvm/test/MachineVerifier/AMDGPU/lit64.mir new file mode 100644 index 0000000000000..acee2ae2a1aa5 --- /dev/null +++ b/llvm/test/MachineVerifier/AMDGPU/lit64.mir @@ -0,0 +1,9 @@ +# RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs -run-pass=none -o - %s 2>&1 | FileCheck %s + +--- + +name: lit64 +body: | + bb.0: + ; CHECK: illegal 64-bit immediate value for operand. + $vgpr0_vgpr1 = V_ADD_F64_e64 0, 68719476721, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode diff --git a/llvm/test/TableGen/sort.td b/llvm/test/TableGen/sort.td new file mode 100644 index 0000000000000..29107a7e465a1 --- /dev/null +++ b/llvm/test/TableGen/sort.td @@ -0,0 +1,71 @@ +// RUN: llvm-tblgen %s | FileCheck %s +// RUN: not llvm-tblgen -DERROR_NONLIST %s 2>&1 | FileCheck --check-prefix=ERROR_NONLIST %s +// RUN: not llvm-tblgen -DERROR_KEYTYPE %s 2>&1 | FileCheck --check-prefix=ERROR_KEYTYPE %s + +// Sort an already-sorted list — should be a no-op. +// CHECK-LABEL: def idempotent +// CHECK: already = ["a", "b", "c"]; +def idempotent { + list already = !sort(item, ["a", "b", "c"], item); +} + +// Sort records by a string field or an int field. The sort is stable, so order +// is preserved on elements with equal key. +// CHECK-LABEL: def key_expr +// CHECK: by_name = [thing_a20, thing_a10, thing_b20, thing_c30, thing_d10]; +// CHECK: by_value = [thing_a10, thing_d10, thing_a20, thing_b20, thing_c30]; +class Thing { + string Name = N; + int Value = V; +} +def thing_a10 : Thing<"alpha", 10>; +def thing_a20 : Thing<"alpha", 20>; +def thing_b20 : Thing<"beta", 20>; +def thing_c30 : Thing<"charlie", 30>; +def thing_d10 : Thing<"delta", 10>; +defvar Things = [thing_c30, thing_a20, thing_a10, thing_b20, thing_d10]; +def key_expr { + list by_name = !sort(t, Things, t.Name); + list by_value = !sort(t, Things, t.Value); +} + +// Sort a single-element list. 
+// CHECK-LABEL: def single +// CHECK: one = ["only"]; +def single { + list one = !sort(item, ["only"], item); +} + +// CHECK-LABEL: def sorted +// CHECK: sorted_strings = ["axolotl", "barracuda", "cephalopod", "dragonfly"]; +// CHECK: sorted_ints = [1, 2, 3, 5, 8]; +// CHECK: sorted_empty_strings = []; +// CHECK: sorted_empty_ints = []; +defvar EmptyStrings = []; +defvar Creatures = ["cephalopod", "axolotl", "dragonfly", "barracuda"]; +defvar EmptyInts = []; +defvar Nums = [5, 2, 8, 1, 3]; +def sorted { + list sorted_strings = !sort(item, Creatures, item); + list sorted_ints = !sort(n, Nums, n); + list sorted_empty_strings = !sort(item, EmptyStrings, item); + list sorted_empty_ints = !sort(n, EmptyInts, n); +} + +#ifdef ERROR_NONLIST +// Dag is not a valid second argument. +def myop; +defvar mydag = (myop); +def err_nonlist { + // ERROR_NONLIST: sort.td:[[@LINE+1]]:38: error: !sort must have a list argument + list bad = !sort(x, mydag, !cast(x)); +} +#endif + +#ifdef ERROR_KEYTYPE +// Key that cannot be resolved to int or string leaves the op unfolded. +// ERROR_KEYTYPE: sort.td:[[@LINE+1]]:5: error: Initializer of 'bad' in 'err_keytype' could not be fully resolved +def err_keytype { + list bad = !sort(t, Things, t); +} +#endif diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll index 8c8348cf5700f..4bb6bf39413d0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll @@ -60,8 +60,6 @@ exit: ret i64 %res } -; The scalar cost of this loop must include the freeze's cost, otherwise VF=2 -; is incorrectly rejected as unprofitable. 
define i32 @or_reduction_with_freeze(ptr %dst, ptr %src) { ; CHECK-LABEL: define i32 @or_reduction_with_freeze( ; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]]) { @@ -75,7 +73,7 @@ define i32 @or_reduction_with_freeze(ptr %dst, ptr %src) { ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[DST6]], [[SRC7]] ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 18 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 10 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] ; CHECK: [[VECTOR_SCEVCHECK]]: ; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[DST1]] to i3 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll index 89a69e64e0a88..00d73a7ab6825 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll @@ -1,9 +1,9 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 ; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=1 -S | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" -; CHECK-LABEL: @reduction_i8 ; ; char reduction_i8(char *a, char *b, int n) { ; char sum = 0; @@ -12,18 +12,69 @@ target triple = "aarch64--linux-gnu" ; return sum; ; } ; -; CHECK: vector.body: -; CHECK: phi <16 x i8> -; CHECK: load <16 x i8> -; CHECK: load <16 x i8> -; CHECK: add <16 x i8> -; CHECK: add <16 x i8> -; -; CHECK: middle.block: -; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> -; CHECK: zext i8 [[Rdx]] to i32 -; define i8 @reduction_i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) { +; CHECK-LABEL: define i8 
@reduction_i8( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP_12:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_12]], label %[[ITER_CHECK:.*]], [[FOR_COND_CLEANUP:label %.*]] +; CHECK: [[ITER_CHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[N]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP0]], 12 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483632 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i8> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP6:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> 
[[TMP4]]) +; CHECK-NEXT: [[TMP7:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_VEC4:%.*]] = and i64 [[TMP0]], 2147483644 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> , i32 [[BC_MERGE_RDX]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = trunc nuw <4 x i32> [[TMP8]] to <4 x i8> +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i8> [ [[TMP9]], %[[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX5]] +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX5]] +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i8> [[VEC_PHI6]], [[WIDE_LOAD7]] +; CHECK-NEXT: [[TMP13]] = add <4 x i8> [[TMP12]], [[WIDE_LOAD8]] +; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[TMP14]], label 
%[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP15:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP13]]) +; CHECK-NEXT: [[TMP16:%.*]] = zext i8 [[TMP15]] to i32 +; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[N_VEC4]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N10]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; entry: %cmp.12 = icmp sgt i32 %n, 0 br i1 %cmp.12, label %for.body.preheader, label %for.cond.cleanup @@ -58,7 +109,6 @@ for.body: br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body } -; CHECK-LABEL: @reduction_i16_1 ; ; short reduction_i16_1(short *a, short *b, int n) { ; short sum = 0; @@ -67,18 +117,38 @@ for.body: ; return sum; ; } ; -; CHECK: vector.body: -; CHECK: phi <8 x i16> -; CHECK: load <8 x i16> -; CHECK: load <8 x i16> -; CHECK: add <8 x i16> -; CHECK: add <8 x i16> -; -; CHECK: middle.block: -; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> -; CHECK: zext i16 [[Rdx]] to i32 -; define i16 @reduction_i16_1(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) { +; CHECK-LABEL: define i16 @reduction_i16_1( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP_16:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_16]], label %[[FOR_BODY_PREHEADER:.*]], [[FOR_COND_CLEANUP:label %.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: 
[[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x i8], ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8], ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4]] = add <8 x i16> [[TMP3]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP4]]) +; CHECK-NEXT: [[TMP7:%.*]] = zext i16 [[TMP6]] to i32 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; entry: %cmp.16 = icmp sgt i32 %n, 0 br i1 %cmp.16, label %for.body.preheader, label %for.cond.cleanup @@ -113,7 +183,6 @@ for.body: br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body } -; CHECK-LABEL: @reduction_i16_2 ; ; short reduction_i16_2(char *a, char *b, int n) { ; short sum = 0; @@ -122,20 +191,74 @@ for.body: ; return sum; ; } ; -; CHECK: vector.body: -; CHECK: phi <16 x i16> -; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8> -; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16> -; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8> -; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16> -; CHECK: add <16 x i16> -; CHECK: add <16 x i16> -; -; CHECK: middle.block: -; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 
@llvm.vector.reduce.add.v16i16(<16 x i16> -; CHECK: zext i16 [[Rdx]] to i32 -; define i16 @reduction_i16_2(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) { +; CHECK-LABEL: define i16 @reduction_i16_2( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP_14:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_14]], label %[[ITER_CHECK:.*]], [[FOR_COND_CLEANUP:label %.*]] +; CHECK: [[ITER_CHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[N]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP0]], 12 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483632 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i16> [[VEC_PHI]], [[TMP2]] +; CHECK-NEXT: [[TMP6]] = add <16 x i16> [[TMP5]], [[TMP4]] +; 
CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP6]]) +; CHECK-NEXT: [[TMP9:%.*]] = zext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_VEC4:%.*]] = and i64 [[TMP0]], 2147483644 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> , i32 [[BC_MERGE_RDX]], i64 0 +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP10]], %[[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX5]] +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[WIDE_LOAD7]] to <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX5]] +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x 
i8>, ptr [[TMP13]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = zext <4 x i8> [[WIDE_LOAD8]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[VEC_PHI6]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = add nsw <4 x i32> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP17]] = and <4 x i32> [[TMP16]], splat (i32 65535) +; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[TMP18]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP19:%.*]] = trunc <4 x i32> [[TMP16]] to <4 x i16> +; CHECK-NEXT: [[TMP20:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP19]]) +; CHECK-NEXT: [[TMP21:%.*]] = zext i16 [[TMP20]] to i32 +; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[N_VEC4]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N10]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; entry: %cmp.14 = icmp sgt i32 %n, 0 br i1 %cmp.14, label %for.body.preheader, label %for.cond.cleanup diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll index 12d32872e1453..aeb222e677e63 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll @@ -104,7 +104,7 @@ define void @test_vpinstruction_freeze_cost(ptr %src, ptr noalias %dst) { ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %g.src = getelementptr inbounds i64, ptr %src, i64 %iv ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %l = load i64, ptr %g.src, align 8 -; CHECK: LV: Found an estimated cost of 1 for VF 1 For 
instruction: %fr = freeze i64 %l +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %fr = freeze i64 %l ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %g.dst = getelementptr inbounds i64, ptr %dst, i64 %iv ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %fr, ptr %g.dst, align 8 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add nuw nsw i64 %iv, 1 @@ -117,7 +117,7 @@ define void @test_vpinstruction_freeze_cost(ptr %src, ptr noalias %dst) { ; CHECK: Cost of 0 for VF 2: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<[[VP4]]> ; CHECK: Cost of 0 for VF 2: vp<[[VP5:%[0-9]+]]> = vector-pointer inbounds ir<%g.src> ; CHECK: Cost of 1 for VF 2: WIDEN ir<%l> = load vp<[[VP5]]> -; CHECK: Cost of 2 for VF 2: WIDEN ir<%fr> = freeze ir<%l> +; CHECK: Cost of 0 for VF 2: WIDEN ir<%fr> = freeze ir<%l> ; CHECK: Cost of 0 for VF 2: CLONE ir<%g.dst> = getelementptr inbounds ir<%dst>, vp<[[VP4]]> ; CHECK: Cost of 0 for VF 2: vp<[[VP6:%[0-9]+]]> = vector-pointer inbounds ir<%g.dst> ; CHECK: Cost of 1 for VF 2: WIDEN store vp<[[VP6]]>, ir<%fr> @@ -142,7 +142,7 @@ define void @test_vpinstruction_freeze_cost(ptr %src, ptr noalias %dst) { ; CHECK: Cost of 0 for VF 4: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<[[VP4]]> ; CHECK: Cost of 0 for VF 4: vp<[[VP5]]> = vector-pointer inbounds ir<%g.src> ; CHECK: Cost of 1 for VF 4: WIDEN ir<%l> = load vp<[[VP5]]> -; CHECK: Cost of 2 for VF 4: WIDEN ir<%fr> = freeze ir<%l> +; CHECK: Cost of 0 for VF 4: WIDEN ir<%fr> = freeze ir<%l> ; CHECK: Cost of 0 for VF 4: CLONE ir<%g.dst> = getelementptr inbounds ir<%dst>, vp<[[VP4]]> ; CHECK: Cost of 0 for VF 4: vp<[[VP6]]> = vector-pointer inbounds ir<%g.dst> ; CHECK: Cost of 1 for VF 4: WIDEN store vp<[[VP6]]>, ir<%fr> diff --git a/llvm/test/tools/llvm-cov/directory_coverage.linux.test b/llvm/test/tools/llvm-cov/directory_coverage.linux.test deleted file mode 100644 index 
5db76c5ab833f..0000000000000 --- a/llvm/test/tools/llvm-cov/directory_coverage.linux.test +++ /dev/null @@ -1,53 +0,0 @@ -# REQUIRES: system-linux -# RUN: mkdir -p %t - -# RUN: llvm-cov show %S/Inputs/directory_coverage/main.covmapping \ -# RUN: --instr-profile %S/Inputs/directory_coverage/main.profdata \ -# RUN: --path-equivalence=/tmp/directory_coverage,%S/Inputs/directory_coverage \ -# RUN: --format=text --show-directory-coverage -o %t/report-text - -# RUN: llvm-cov show %S/Inputs/directory_coverage/main.covmapping \ -# RUN: --instr-profile %S/Inputs/directory_coverage/main.profdata \ -# RUN: --path-equivalence=/tmp/directory_coverage,%S/Inputs/directory_coverage \ -# RUN: --format=html --show-directory-coverage -o %t/report-html - -# RUN: FileCheck --input-file %t/report-text/index.txt %s --check-prefix=ROOT -# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/index.txt %s --check-prefix=ROOT -# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/b0/index.txt %s --check-prefix=B0 -# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/c0/c1/index.txt %s --check-prefix=C1 - -# RUN: FileCheck --input-file %t/report-html/index.html %s --check-prefix=HTML-TOP --allow-empty -# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/index.html %s --check-prefix=ROOT -# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/b0/index.html %s --check-prefix=B0 -# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/c0/c1/index.html %s --check-prefix=C1 - - - -# HTML-TOP: coverage/index.html - -# ROOT: a0/a1/a2.cc -# ROOT: b0/ -# ROOT-NOT: b1_1.cc -# ROOT-NOT: b1_2.cc -# ROOT: c0/c1/ -# ROOT-NOT: c2_1.cc -# ROOT-NOT: b2_2.cc -# ROOT: main.cc - -# B0: b1_1.cc -# B0: b1_2.cc - -# C1: c2.h -# C1: c2_1.cc - - -For regenerating the test: - -cp -r %S/Inputs/directory_coverage /tmp -cd /tmp/directory_coverage -clang -fprofile-instr-generate 
-fcoverage-mapping -mllvm -enable-name-compression=false \ - -o main main.cc a0/a1/a2.cc b0/b1_1.cc b0/b1_2.cc c0/c1/c2_1.cc c0/c1/c2_2.cc -./main -llvm-cov convert-for-testing main -o main.covmapping -llvm-profdata merge default.profraw -o main.profdata -rm main default.profraw diff --git a/llvm/test/tools/llvm-cov/directory_coverage.win.test b/llvm/test/tools/llvm-cov/directory_coverage.test similarity index 53% rename from llvm/test/tools/llvm-cov/directory_coverage.win.test rename to llvm/test/tools/llvm-cov/directory_coverage.test index f948bdcae3a58..99dc0ccb82287 100644 --- a/llvm/test/tools/llvm-cov/directory_coverage.win.test +++ b/llvm/test/tools/llvm-cov/directory_coverage.test @@ -1,44 +1,52 @@ -# REQUIRES: system-windows -# RUN: mkdir -p %t - -# RUN: llvm-cov show %S/Inputs/directory_coverage/main.covmapping \ -# RUN: --instr-profile %S/Inputs/directory_coverage/main.profdata \ -# RUN: --path-equivalence=/tmp/directory_coverage,%S/Inputs/directory_coverage \ -# RUN: --format=text --show-directory-coverage -o %t/report-text - -# RUN: llvm-cov show %S/Inputs/directory_coverage/main.covmapping \ -# RUN: --instr-profile %S/Inputs/directory_coverage/main.profdata \ -# RUN: --path-equivalence=/tmp/directory_coverage,%S/Inputs/directory_coverage \ -# RUN: --format=html --show-directory-coverage -o %t/report-html - -# RUN: FileCheck --input-file %t/report-text/index.txt %s --check-prefix=ROOT -# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/index.txt %s --check-prefix=ROOT -# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/b0/index.txt %s --check-prefix=B0 -# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/c0/c1/index.txt %s --check-prefix=C1 - -# RUN: FileCheck --input-file %t/report-html/index.html %s --check-prefix=HTML-TOP --allow-empty -# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/index.html %s --check-prefix=ROOT -# RUN: FileCheck 
--input-file %t/report-html/coverage/tmp/directory_coverage/b0/index.html %s --check-prefix=B0 -# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/c0/c1/index.html %s --check-prefix=C1 - - - -# HTML-TOP: coverage\index.html - -# ROOT: a0\a1\a2.cc -# ROOT: b0\ -# ROOT-NOT: b1_1.cc -# ROOT-NOT: b1_2.cc -# ROOT: c0\c1\ -# ROOT-NOT: c2_1.cc -# ROOT-NOT: b2_2.cc -# ROOT: main.cc - -# B0: b1_1.cc -# B0: b1_2.cc - -# C1: c2.h -# C1: c2_1.cc - - -The input of this test is generated on Linux. See 'directory_coverage.linux.test'. +# RUN: mkdir -p %t + +# RUN: llvm-cov show %S/Inputs/directory_coverage/main.covmapping \ +# RUN: --instr-profile %S/Inputs/directory_coverage/main.profdata \ +# RUN: --path-equivalence=/tmp/directory_coverage,%S/Inputs/directory_coverage \ +# RUN: --format=text --show-directory-coverage -o %t/report-text + +# RUN: llvm-cov show %S/Inputs/directory_coverage/main.covmapping \ +# RUN: --instr-profile %S/Inputs/directory_coverage/main.profdata \ +# RUN: --path-equivalence=/tmp/directory_coverage,%S/Inputs/directory_coverage \ +# RUN: --format=html --show-directory-coverage -o %t/report-html + +# RUN: FileCheck --input-file %t/report-text/index.txt %s --check-prefix=ROOT -DSEP=%{fs-sep} +# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/index.txt %s --check-prefix=ROOT -DSEP=%{fs-sep} +# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/b0/index.txt %s --check-prefix=B0 -DSEP=%{fs-sep} +# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/c0/c1/index.txt %s --check-prefix=C1 -DSEP=%{fs-sep} + +# RUN: FileCheck --input-file %t/report-html/index.html %s --check-prefix=HTML-TOP --allow-empty -DSEP=%{fs-sep} +# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/index.html %s --check-prefix=ROOT -DSEP=%{fs-sep} +# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/b0/index.html %s --check-prefix=B0 
-DSEP=%{fs-sep} +# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/c0/c1/index.html %s --check-prefix=C1 -DSEP=%{fs-sep} + + + +# HTML-TOP: coverage[[SEP]]index.html + +# ROOT: a0[[SEP]]a1[[SEP]]a2.cc +# ROOT: b0[[SEP]] +# ROOT-NOT: b1_1.cc +# ROOT-NOT: b1_2.cc +# ROOT: c0[[SEP]]c1[[SEP]] +# ROOT-NOT: c2_1.cc +# ROOT-NOT: b2_2.cc +# ROOT: main.cc + +# B0: b1_1.cc +# B0: b1_2.cc + +# C1: c2.h +# C1: c2_1.cc + + +# The input of this test is generated on Linux. +# For regenerating the test: +# cp -r %S/Inputs/directory_coverage /tmp +# cd /tmp/directory_coverage +# clang -fprofile-instr-generate -fcoverage-mapping -mllvm -enable-name-compression=false \ +# -o main main.cc a0/a1/a2.cc b0/b1_1.cc b0/b1_2.cc c0/c1/c2_1.cc c0/c1/c2_2.cc +# ./main +# llvm-cov convert-for-testing main -o main.covmapping +# llvm-profdata merge default.profraw -o main.profdata +# rm main default.profraw diff --git a/llvm/test/tools/llvm-cov/native_separators.c b/llvm/test/tools/llvm-cov/native_separators.c index 3c768e1014b92..4fe305de4b13e 100644 --- a/llvm/test/tools/llvm-cov/native_separators.c +++ b/llvm/test/tools/llvm-cov/native_separators.c @@ -1,20 +1,21 @@ // To create the covmapping for this file on Linux, copy this file to /tmp // cd into /tmp. Use llvm-cov convert-for-testing to extract the covmapping. // This test is Windows-only. It checks that all paths, which are generated -// in the index and source coverage reports, are native path. For example, -// on Windows all '/' are converted to '\'. +// in the index and source coverage reports, are native paths. For example, +// on Windows all '/' are converted to the native separator, the direction +// of which is controlled by LLVM_WINDOWS_PREFER_FORWARD_SLASH. 
// REQUIRES: system-windows // RUN: llvm-profdata merge %S/Inputs/double_dots.proftext -o %t.profdata // RUN: llvm-cov show %S/Inputs/native_separators.covmapping -instr-profile=%t.profdata -o %t.dir -// RUN: FileCheck -check-prefixes=TEXT-INDEX -input-file=%t.dir/index.txt %s +// RUN: FileCheck -check-prefixes=TEXT-INDEX -input-file=%t.dir/index.txt -DSEP=%{fs-sep} %s // RUN: llvm-cov show -format=html %S/Inputs/native_separators.covmapping -instr-profile=%t.profdata -path-equivalence=/tmp,%S %S/../llvm-"config"/../llvm-"cov"/native_separators.c -o %t.dir -// RUN: FileCheck -check-prefixes=HTML-INDEX -input-file=%t.dir/index.html %s +// RUN: FileCheck -check-prefixes=HTML-INDEX -input-file=%t.dir/index.html -DSEP=%{fs-sep} %s // RUN: llvm-cov show -format=html %S/Inputs/native_separators.covmapping -instr-profile=%t.profdata -path-equivalence=/tmp,%S %s -o %t.dir -// RUN: FileCheck -check-prefixes=HTML -input-file=%t.dir/coverage/tmp/native_separators.c.html %s +// RUN: FileCheck -check-prefixes=HTML -input-file=%t.dir/coverage/tmp/native_separators.c.html -DSEP=%{fs-sep} %s -// TEXT-INDEX: \tmp\native_separators.c -// HTML-INDEX: >tmp\native_separators.c -// HTML:
\tmp\native_separators.c
+// TEXT-INDEX: [[SEP]]tmp[[SEP]]native_separators.c +// HTML-INDEX: >tmp[[SEP]]native_separators.c +// HTML:
[[SEP]]tmp[[SEP]]native_separators.c
int main() {} diff --git a/llvm/test/tools/llvm-objdump/X86/source-interleave-prefix-windows.test b/llvm/test/tools/llvm-objdump/X86/source-interleave-prefix-windows.test index 0f8952daec42e..f75ba94353b68 100644 --- a/llvm/test/tools/llvm-objdump/X86/source-interleave-prefix-windows.test +++ b/llvm/test/tools/llvm-objdump/X86/source-interleave-prefix-windows.test @@ -6,6 +6,14 @@ ; RUN: sed -e "s,SRC_COMPDIR,/Inputs,g" %p/Inputs/source-interleave.ll > %t.ll ; RUN: llc -o %t.o -filetype=obj -mtriple=x86_64-pc-linux %t.ll -; RUN: llvm-objdump --prefix 'myprefix/\' --source %t.o 2>&1 | FileCheck %s -DFILE=%t.o -DPREFIX='myprefix' -; CHECK: warning: '[[FILE]]': failed to find source [[PREFIX]]/Inputs\source-interleave-x86_64.c +; RUN: llvm-objdump --prefix 'myprefix/\' --source %t.o 2>&1 | FileCheck %s -DFILE=%t.o -DPREFIX='myprefix' -DSEP=%{fs-sep} + +;; When --prefix is specified and the file path is absolute, sys::path::append +;; is used to add the file path to the prefix. On Windows, if the file path +;; starts with a slash, sys::path::append currently does a straight +;; concatenation, resulting in the first slash being preserved as-is ('/'). +;; The second slash (after 'Inputs') is generated by llc using +;; sys::path::append to join the directory and filename, which uses the +;; preferred path separator (controlled by LLVM_WINDOWS_PREFER_FORWARD_SLASH). 
+; CHECK: warning: '[[FILE]]': failed to find source [[PREFIX]]/Inputs[[SEP]]source-interleave-x86_64.c diff --git a/llvm/tools/llvm-offload-binary/llvm-offload-binary.cpp b/llvm/tools/llvm-offload-binary/llvm-offload-binary.cpp index 49c4c91c41c1a..28dfa044aab78 100644 --- a/llvm/tools/llvm-offload-binary/llvm-offload-binary.cpp +++ b/llvm/tools/llvm-offload-binary/llvm-offload-binary.cpp @@ -92,8 +92,7 @@ static Error writeFile(StringRef Filename, StringRef Data) { } static Error bundleImages() { - SmallVector BinaryData; - raw_svector_ostream OS(BinaryData); + SmallVector AllImages; for (StringRef Image : DeviceImages) { BumpPtrAllocator Alloc; StringSaver Saver(Alloc); @@ -128,16 +127,16 @@ static Error bundleImages() { ImageBinary.StringData[Key] = Value; } } - llvm::SmallString<0> Buffer = OffloadBinary::write(ImageBinary); - if (Buffer.size() % OffloadBinary::getAlignment() != 0) - return createStringError(inconvertibleErrorCode(), - "Offload binary has invalid size alignment"); - OS << Buffer; + AllImages.emplace_back(std::move(ImageBinary)); } } - if (Error E = writeFile(OutputFile, - StringRef(BinaryData.begin(), BinaryData.size()))) + SmallString<0> Buffer = OffloadBinary::write(AllImages); + if (Buffer.size() % OffloadBinary::getAlignment() != 0) + return createStringError(inconvertibleErrorCode(), + "Offload binary has invalid size alignment"); + + if (Error E = writeFile(OutputFile, StringRef(Buffer.data(), Buffer.size()))) return E; return Error::success(); } diff --git a/llvm/unittests/Support/CommandLineTest.cpp b/llvm/unittests/Support/CommandLineTest.cpp index fca2d298c460e..f2effbdddfbbf 100644 --- a/llvm/unittests/Support/CommandLineTest.cpp +++ b/llvm/unittests/Support/CommandLineTest.cpp @@ -1015,7 +1015,7 @@ TEST(CommandLineTest, ResponseFiles) { TEST(CommandLineTest, RecursiveResponseFiles) { vfs::InMemoryFileSystem FS; #ifdef _WIN32 - const char *TestRoot = "C:\\"; + const char *TestRoot = LLVM_WINDOWS_PREFER_FORWARD_SLASH ? 
"C:/" : "C:\\"; #else const char *TestRoot = "/"; #endif @@ -1085,7 +1085,7 @@ TEST(CommandLineTest, RecursiveResponseFiles) { TEST(CommandLineTest, ResponseFilesAtArguments) { vfs::InMemoryFileSystem FS; #ifdef _WIN32 - const char *TestRoot = "C:\\"; + const char *TestRoot = LLVM_WINDOWS_PREFER_FORWARD_SLASH ? "C:/" : "C:\\"; #else const char *TestRoot = "/"; #endif diff --git a/llvm/unittests/Support/Path.cpp b/llvm/unittests/Support/Path.cpp index b196dc1d5452b..bb825a1b1e65c 100644 --- a/llvm/unittests/Support/Path.cpp +++ b/llvm/unittests/Support/Path.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallVector.h" #include "llvm/BinaryFormat/Magic.h" +#include "llvm/Config/config.h" #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX #include "llvm/Support/Compiler.h" #include "llvm/Support/ConvertUTF.h" @@ -2792,7 +2793,9 @@ TEST_F(FileSystemTest, makeLongFormPath) { // Setup: A test directory longer than 8 characters for which a distinct // short 8.3 form name will be created on Windows. Typically, 123456~1. - constexpr const char *OneDir = "\\123456789"; // >8 chars + const char *OneDir = LLVM_WINDOWS_PREFER_FORWARD_SLASH + ? "/123456789" + : "\\123456789"; // >8 chars // Setup: Create a path where even if all components were reduced to short 8.3 // form names, the total length would exceed MAX_PATH. 
@@ -2825,6 +2828,8 @@ TEST_F(FileSystemTest, makeLongFormPath) { ASSERT_FALSE(DotAndDotDot.empty()) << "Expected short 8.3 form path for test directory."; auto ContainsDotAndDotDot = [](llvm::StringRef S) { + if (LLVM_WINDOWS_PREFER_FORWARD_SLASH) + return S.contains("/./") && S.contains("/../"); return S.contains("\\.\\") && S.contains("\\..\\"); }; ASSERT_TRUE(ContainsDotAndDotDot(DotAndDotDot)) diff --git a/llvm/unittests/Target/AArch64/InstSizes.cpp b/llvm/unittests/Target/AArch64/InstSizes.cpp index 9dffb6e600d62..898e3dae167b3 100644 --- a/llvm/unittests/Target/AArch64/InstSizes.cpp +++ b/llvm/unittests/Target/AArch64/InstSizes.cpp @@ -29,12 +29,18 @@ std::unique_ptr createTargetMachine() { std::nullopt, CodeGenOptLevel::Default)); } -std::unique_ptr createInstrInfo(TargetMachine *TM) { - AArch64Subtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), - std::string(TM->getTargetCPU()), - std::string(TM->getTargetFeatureString()), *TM, - /* isLittle */ false); - return std::make_unique(ST); +std::pair, std::unique_ptr> +createInstrInfo(TargetMachine *TM) { + auto ST = std::make_unique( + TM->getTargetTriple(), std::string(TM->getTargetCPU()), + std::string(TM->getTargetCPU()), + std::string(TM->getTargetFeatureString()), *TM, + /* isLittle */ false); + // The AArch64InstrInfo constructor takes a const reference to *ST, hence we + // cannot stack allocate *ST. 
+ auto II = std::make_unique(*ST); + + return {std::move(ST), std::move(II)}; } /// The \p InputIRSnippet is only needed for things that can't be expressed in @@ -90,7 +96,7 @@ void runChecks( TEST(InstSizes, Authenticated) { std::unique_ptr TM = createTargetMachine(); ASSERT_TRUE(TM); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); auto isAuthInst = [](AArch64InstrInfo &II, MachineFunction &MF) { auto I = MF.begin()->begin(); @@ -122,7 +128,7 @@ TEST(InstSizes, Authenticated) { TEST(InstSizes, STACKMAP) { std::unique_ptr TM = createTargetMachine(); ASSERT_TRUE(TM); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), "", " STACKMAP 0, 16\n" " STACKMAP 1, 32\n", @@ -136,7 +142,7 @@ TEST(InstSizes, STACKMAP) { TEST(InstSizes, PATCHPOINT) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), "", " PATCHPOINT 0, 16, 0, 0, 0, csr_aarch64_aapcs\n" @@ -151,7 +157,7 @@ TEST(InstSizes, PATCHPOINT) { TEST(InstSizes, STATEPOINT) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), "", " STATEPOINT 0, 0, 0, @sizes, 2, 0, 2, 0, 2, 0, 2, 1, 1, 8," @@ -164,7 +170,7 @@ TEST(InstSizes, STATEPOINT) { TEST(InstSizes, SPACE) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), "", " $xzr = SPACE 1024, undef $xzr\n" @@ -179,7 +185,7 @@ TEST(InstSizes, SPACE) { TEST(InstSizes, TLSDESC_CALLSEQ) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks( TM.get(), II.get(), @@ -193,7 +199,7 @@ TEST(InstSizes, TLSDESC_CALLSEQ) { 
TEST(InstSizes, StoreSwiftAsyncContext) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks( TM.get(), II.get(), "", @@ -207,7 +213,7 @@ TEST(InstSizes, StoreSwiftAsyncContext) { TEST(InstSizes, SpeculationBarrierISBDSBEndBB) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks( TM.get(), II.get(), "", @@ -221,7 +227,7 @@ TEST(InstSizes, SpeculationBarrierISBDSBEndBB) { TEST(InstSizes, SpeculationBarrierSBEndBB) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks( TM.get(), II.get(), "", @@ -235,7 +241,7 @@ TEST(InstSizes, SpeculationBarrierSBEndBB) { TEST(InstSizes, JumpTable) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), "", " $x10, $x11 = JumpTableDest32 $x9, $x8, %jump-table.0\n" @@ -253,7 +259,7 @@ TEST(InstSizes, JumpTable) { TEST(InstSizes, MOVaddr) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); auto Check8 = [](AArch64InstrInfo &II, MachineFunction &MF) { auto I = MF.begin()->begin(); @@ -296,7 +302,7 @@ TEST(InstSizes, MOVaddr) { TEST(InstSizes, MOVaddrTagged) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), " @g = external global i32\n", " $x0 = MOVaddr target-flags(aarch64-page, aarch64-tagged) @g, " @@ -309,7 +315,7 @@ TEST(InstSizes, MOVaddrTagged) { TEST(InstSizes, MOVi32imm) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); 
runChecks(TM.get(), II.get(), "", " $w0 = MOVi32imm 1\n" @@ -324,7 +330,7 @@ TEST(InstSizes, MOVi32imm) { TEST(InstSizes, MOVi64imm) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), "", " $x0 = MOVi64imm 1\n" @@ -342,7 +348,7 @@ TEST(InstSizes, MOVi64imm) { TEST(InstSizes, MOPSMemoryPseudos) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), "", " $x0, $x1, $x2 = MOPSMemoryMovePseudo $x0, $x1, $x2, " diff --git a/llvm/utils/instrumentor-config-wizard.py b/llvm/utils/instrumentor-config-wizard.py new file mode 100755 index 0000000000000..c3599ce0c47d8 --- /dev/null +++ b/llvm/utils/instrumentor-config-wizard.py @@ -0,0 +1,834 @@ +#!/usr/bin/env python3 +""" +Interactive wizard for configuring the LLVM Instrumentor pass. + +This script helps users create custom instrumentation configurations by: +1. Generating a default config file using opt +2. Presenting available instrumentation options interactively +3. Allowing users to enable/disable specific instrumentation opportunities +4. 
Saving the customized configuration to a JSON file +""" + +import argparse +import json +import os +import subprocess +import sys +import tempfile +from typing import Dict, List, Any, Optional, Tuple + + +class InstrumentorConfigWizard: + def __init__(self, opt_path: str = None): + """Initialize the wizard with the path to opt.""" + self.opt_path = opt_path or self.find_opt() + self.config = {} + self.enabled_opportunities = set() + self.same_pre_post = True + self.navigation_stack = [] + + def find_opt(self) -> str: + """Find the opt binary in the build directory.""" + # Try common locations relative to this script + script_dir = os.path.dirname(os.path.abspath(__file__)) + repo_root = os.path.dirname(os.path.dirname(script_dir)) + + # Check build/bin/opt + opt_candidates = [ + os.path.join(repo_root, "build", "bin", "opt"), + os.path.join(repo_root, "build", "Debug", "bin", "opt"), + os.path.join(repo_root, "build", "Release", "bin", "opt"), + "opt", # Try system PATH + ] + + for candidate in opt_candidates: + if os.path.exists(candidate): + return candidate + # Check if it's in PATH + try: + subprocess.run( + [candidate, "--version"], capture_output=True, check=True, timeout=5 + ) + return candidate + except ( + subprocess.CalledProcessError, + FileNotFoundError, + subprocess.TimeoutExpired, + ): + continue + + raise FileNotFoundError( + "Could not find 'opt' binary. 
Please specify the path using --opt-path" + ) + + def generate_default_config(self) -> Dict[str, Any]: + """Generate a default configuration by running opt.""" + print(f"Generating default configuration using: {self.opt_path}") + + # Create a minimal LLVM IR module to trigger config generation + minimal_ir = """ +define i32 @main() { + %1 = alloca i32 + store i32 0, ptr %1 + %2 = load i32, ptr %1 + ret i32 %2 +} +""" + + with tempfile.TemporaryDirectory() as tmpdir: + ir_file = os.path.join(tmpdir, "input.ll") + config_file = os.path.join(tmpdir, "config.json") + + # Write minimal IR + with open(ir_file, "w") as f: + f.write(minimal_ir) + + # Run opt with instrumentor to generate config + try: + cmd = [ + self.opt_path, + "-passes=instrumentor", + f"-instrumentor-write-config-file={config_file}", + "-disable-output", + ir_file, + ] + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + + if result.returncode != 0: + print( + f"Warning: opt returned non-zero exit code: {result.returncode}" + ) + if result.stderr: + print(f"stderr: {result.stderr}") + + # Read the generated config + if not os.path.exists(config_file): + raise FileNotFoundError( + f"Config file was not generated at {config_file}" + ) + + with open(config_file, "r") as f: + config = json.load(f) + + print("✓ Default configuration generated successfully\n") + return config + + except subprocess.TimeoutExpired: + raise RuntimeError("opt command timed out") + except Exception as e: + raise RuntimeError(f"Failed to generate config: {e}") + + def clear_screen(self): + """Clear the terminal screen.""" + os.system("clear" if os.name != "nt" else "cls") + + def print_section_header(self, title: str): + """Print a formatted section header.""" + print("\n" + "=" * 70) + print(f" {title}") + print("=" * 70) + + def print_option(self, index: int, name: str, description: str, enabled: bool): + """Print a formatted option.""" + status = "[X]" if enabled else "[ ]" + print(f" {index:2d}. 
{status} {name:30s} - {description}") + + def get_user_choice( + self, prompt: str, valid_choices: List[str] = None, allow_back: bool = True + ) -> Optional[str]: + """Get user input with validation.""" + while True: + try: + nav_hint = " (b=back, q=quit)" if allow_back else " (q=quit)" + choice = input(prompt + nav_hint + ": ").strip().lower() + + if choice == "q": + confirm = input("Really quit? (y/n): ").strip().lower() + if confirm == "y": + print("\nWizard cancelled by user.") + sys.exit(0) + continue + + if choice == "b" and allow_back: + return "BACK" + + if not choice: + return "" + + if valid_choices: + if choice in valid_choices: + return choice + print(f"Please enter one of: {', '.join(valid_choices)}") + else: + return choice + + except KeyboardInterrupt: + print("\n\nWizard interrupted by user.") + sys.exit(0) + + def get_all_opportunity_types(self) -> List[Tuple[str, str]]: + """Extract all unique opportunity types from the config.""" + opportunities = [] + seen = set() + + for location in [ + "function_pre", + "function_post", + "instruction_pre", + "instruction_post", + ]: + if location not in self.config: + continue + + for opp_name in self.config[location].keys(): + if opp_name not in seen: + seen.add(opp_name) + # Get description from first occurrence + opp_config = self.config[location][opp_name] + desc = "No description available" + + # Try to find a description from any field + for key, value in opp_config.items(): + if key == "enabled": + continue + if key.endswith(".description") and value: + desc = value + break + + opportunities.append((opp_name, desc)) + + return sorted(opportunities) + + def select_opportunities(self) -> bool: + """Let user select which instrumentation opportunities to enable.""" + while True: + self.clear_screen() + self.print_section_header("Step 1: Select Instrumentation Types") + + opportunities = self.get_all_opportunity_types() + + print("\nSelect which types of instrumentation you want to configure:") + print( + 
"(You can toggle individual arguments for each type in the next steps)\n" + ) + + for idx, (opp_name, opp_desc) in enumerate(opportunities, 1): + enabled = opp_name in self.enabled_opportunities + self.print_option(idx, opp_name, opp_desc, enabled) + + print("\nCommands:") + print(" - Enter numbers (space-separated) to toggle opportunities") + print(" - 'all' to enable all, 'none' to disable all") + print(" - Press Enter when done to continue") + + choice = self.get_user_choice("\nYour choice", allow_back=False) + + if choice == "BACK": + continue + elif choice == "": + if not self.enabled_opportunities: + print("\n⚠ Please enable at least one instrumentation type!") + input("Press Enter to continue...") + continue + return True + elif choice == "all": + self.enabled_opportunities = {opp[0] for opp in opportunities} + elif choice == "none": + self.enabled_opportunities.clear() + else: + try: + indices = [int(x) for x in choice.split()] + for idx in indices: + if 1 <= idx <= len(opportunities): + opp_name = opportunities[idx - 1][0] + if opp_name in self.enabled_opportunities: + self.enabled_opportunities.remove(opp_name) + else: + self.enabled_opportunities.add(opp_name) + except ValueError: + print( + "\n⚠ Invalid input. Please enter numbers separated by spaces." 
+ ) + input("Press Enter to continue...") + + def configure_pre_post_mode(self) -> bool: + """Ask if PRE and POST should have the same configuration.""" + while True: + self.clear_screen() + self.print_section_header("Step 2: PRE vs POST Configuration") + + print("\nInstrumentation can happen at two points:") + print(" - PRE: Before the instrumented operation") + print(" - POST: After the instrumented operation") + print("\nFor example, for a load instruction:") + print(" - PRE: Can inspect/modify the pointer before the load") + print(" - POST: Can inspect/modify the loaded value after the load") + + print( + f"\nCurrent mode: {'SAME configuration for PRE and POST' if self.same_pre_post else 'DIFFERENT configurations'}" + ) + + choice = self.get_user_choice( + "\nUse same configuration for PRE and POST? (y/n/Enter to keep)", + valid_choices=["y", "yes", "n", "no", ""], + ) + + if choice == "BACK": + return False + elif choice in ["y", "yes"]: + self.same_pre_post = True + return True + elif choice in ["n", "no"]: + self.same_pre_post = False + return True + elif choice == "": + return True + + def configure_base_options(self) -> bool: + """Configure base/global options.""" + while True: + self.clear_screen() + self.print_section_header("Step 3: Base Configuration") + + if "configuration" not in self.config: + self.config["configuration"] = {} + + base_config = self.config["configuration"] + + # Display current settings + print("\nCurrent settings:") + print( + f" 1. Runtime prefix: {base_config.get('runtime_prefix', '__instrumentor_')}" + ) + print( + f" 2. Demangle function names: {base_config.get('demangle_function_names', True)}" + ) + print( + f" 3. Target regex: {base_config.get('target_regex', '(none)')}" + ) + print( + f" 4. Host (CPU) enabled: {base_config.get('host_enabled', True)}" + ) + print( + f" 5. 
GPU enabled: {base_config.get('gpu_enabled', True)}" + ) + + print("\nEnter option number to modify, or press Enter to continue") + choice = self.get_user_choice("Option") + + if choice == "BACK": + return False + elif choice == "": + return True + elif choice == "1": + new_prefix = input("Enter runtime prefix: ").strip() + if new_prefix: + base_config["runtime_prefix"] = new_prefix + elif choice == "2": + demangle = self.get_user_choice( + "Demangle function names? (y/n)", ["y", "n"], allow_back=False + ) + if demangle: + base_config["demangle_function_names"] = demangle == "y" + elif choice == "3": + new_regex = input("Enter target regex (empty for none): ").strip() + base_config["target_regex"] = new_regex + elif choice == "4": + host = self.get_user_choice( + "Enable host instrumentation? (y/n)", ["y", "n"], allow_back=False + ) + if host: + base_config["host_enabled"] = host == "y" + elif choice == "5": + gpu = self.get_user_choice( + "Enable GPU instrumentation? (y/n)", ["y", "n"], allow_back=False + ) + if gpu: + base_config["gpu_enabled"] = gpu == "y" + + def configure_opportunity_args( + self, opp_name: str, location: str, step_prefix: str = "Step 4" + ) -> bool: + """Configure arguments for a specific opportunity at a location.""" + while True: + self.clear_screen() + location_desc = "PRE (before)" if "pre" in location else "POST (after)" + self.print_section_header( + f"{step_prefix}: Configure {opp_name} - {location_desc}" + ) + + if location not in self.config or opp_name not in self.config[location]: + print(f"\n⚠ {opp_name} not found in {location}") + input("Press Enter to continue...") + return True + + opp_config = self.config[location][opp_name] + + # Show enable/disable status + enabled = opp_config.get("enabled", False) + print(f"\nInstrumentation: {'ENABLED ✓' if enabled else 'DISABLED ✗'}") + + # Collect arguments + args = [] + for key, value in sorted(opp_config.items()): + if ( + key == "enabled" + or key.endswith(".description") + or 
key.endswith(".replace") + ): + continue + desc = opp_config.get(f"{key}.description", "No description") + can_replace = f"{key}.replace" in opp_config + replace_enabled = ( + opp_config.get(f"{key}.replace", False) if can_replace else False + ) + args.append((key, value, desc, can_replace, replace_enabled)) + + if args: + print("\nAvailable arguments:") + for idx, ( + arg_name, + arg_enabled, + arg_desc, + can_replace, + replace_enabled, + ) in enumerate(args, 1): + status = "[X]" if arg_enabled else "[ ]" + if can_replace: + replace_status = "REPLACE" if replace_enabled else "observe" + replace_mark = f" [replaceable: {replace_status}]" + else: + replace_mark = "" + print( + f" {idx:2d}. {status} {arg_name:25s} - {arg_desc}{replace_mark}" + ) + + print("\nCommands:") + print(" - 'e' to toggle enabled/disabled") + print(" - Enter numbers (space-separated) to toggle arguments") + print( + " - 'r ' to toggle replacement for replaceable argument (e.g., 'r 1')" + ) + print(" - 'all' to enable all args, 'none' to disable all args") + print(" - Press Enter when done") + + choice = self.get_user_choice("\nYour choice") + + if choice == "BACK": + return False + elif choice == "": + return True + elif choice == "e": + opp_config["enabled"] = not opp_config["enabled"] + elif choice == "all": + for arg_name, _, _, _, _ in args: + opp_config[arg_name] = True + elif choice == "none": + for arg_name, _, _, _, _ in args: + opp_config[arg_name] = False + elif choice.startswith("r "): + # Toggle replacement + try: + parts = choice.split() + if len(parts) == 2: + idx = int(parts[1]) + if 1 <= idx <= len(args): + arg_name, _, _, can_replace, _ = args[idx - 1] + if can_replace: + replace_key = f"{arg_name}.replace" + opp_config[replace_key] = not opp_config.get( + replace_key, False + ) + else: + print(f"\n⚠ Argument '{arg_name}' is not replaceable.") + input("Press Enter to continue...") + else: + print(f"\n⚠ Invalid argument number: {idx}") + input("Press Enter to continue...") + 
else: + print("\n⚠ Usage: r ") + input("Press Enter to continue...") + except ValueError: + print("\n⚠ Invalid input for replacement toggle.") + input("Press Enter to continue...") + else: + try: + indices = [int(x) for x in choice.split()] + for idx in indices: + if 1 <= idx <= len(args): + arg_name = args[idx - 1][0] + opp_config[arg_name] = not opp_config[arg_name] + except ValueError: + print("\n⚠ Invalid input.") + input("Press Enter to continue...") + + def configure_locations(self) -> bool: + """Configure all enabled opportunities for PRE and optionally POST.""" + # First, disable all opportunities that are not in enabled_opportunities + for location in [ + "function_pre", + "function_post", + "instruction_pre", + "instruction_post", + ]: + if location not in self.config: + continue + for opp_name, opp_config in self.config[location].items(): + if opp_name not in self.enabled_opportunities: + opp_config["enabled"] = False + + # Configure PRE locations + step_num = 4 + for idx, opp_name in enumerate(sorted(self.enabled_opportunities), 1): + # Try function_pre first, then instruction_pre + location = None + if ( + "function_pre" in self.config + and opp_name in self.config["function_pre"] + ): + location = "function_pre" + elif ( + "instruction_pre" in self.config + and opp_name in self.config["instruction_pre"] + ): + location = "instruction_pre" + + if location: + if not self.configure_opportunity_args(opp_name, location): + return False + + # If same config, copy PRE to POST + if self.same_pre_post: + for opp_name in self.enabled_opportunities: + # Copy from PRE to POST + if ( + "function_pre" in self.config + and opp_name in self.config["function_pre"] + ): + if ( + "function_post" in self.config + and opp_name in self.config["function_post"] + ): + pre_config = self.config["function_pre"][opp_name] + post_config = self.config["function_post"][opp_name] + # Copy enabled and argument settings + post_config["enabled"] = pre_config.get("enabled", False) + for 
key in pre_config: + if not key.endswith(".description") and key != "enabled": + if key in post_config: + post_config[key] = pre_config[key] + + if ( + "instruction_pre" in self.config + and opp_name in self.config["instruction_pre"] + ): + if ( + "instruction_post" in self.config + and opp_name in self.config["instruction_post"] + ): + pre_config = self.config["instruction_pre"][opp_name] + post_config = self.config["instruction_post"][opp_name] + post_config["enabled"] = pre_config.get("enabled", False) + for key in pre_config: + if not key.endswith(".description") and key != "enabled": + if key in post_config: + post_config[key] = pre_config[key] + else: + # Configure POST locations separately + for opp_name in sorted(self.enabled_opportunities): + location = None + if ( + "function_post" in self.config + and opp_name in self.config["function_post"] + ): + location = "function_post" + elif ( + "instruction_post" in self.config + and opp_name in self.config["instruction_post"] + ): + location = "instruction_post" + + if location: + if not self.configure_opportunity_args(opp_name, location): + return False + + return True + + def generate_runtime_stubs(self, config_path: str, stub_path: str) -> bool: + """Generate runtime stub file using the configuration.""" + print(f"\nGenerating runtime stubs using: {self.opt_path}") + + # Create a minimal LLVM IR module + minimal_ir = """ +define i32 @main() { + %1 = alloca i32 + store i32 0, ptr %1 + %2 = load i32, ptr %1 + ret i32 %2 +} +""" + + with tempfile.TemporaryDirectory() as tmpdir: + ir_file = os.path.join(tmpdir, "input.ll") + temp_config = os.path.join(tmpdir, "temp_config.json") + + # Write minimal IR + with open(ir_file, "w") as f: + f.write(minimal_ir) + + # Create a temporary config with stub file set + temp_cfg = self.config.copy() + if "configuration" not in temp_cfg: + temp_cfg["configuration"] = {} + temp_cfg["configuration"]["runtime_stubs_file"] = stub_path + + with open(temp_config, "w") as f: + 
json.dump(temp_cfg, f, indent=2) + + # Run opt with instrumentor to generate stubs + try: + cmd = [ + self.opt_path, + "-passes=instrumentor", + f"-instrumentor-read-config-file={temp_config}", + "-disable-output", + ir_file, + ] + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + + if result.returncode != 0: + print( + f"Warning: opt returned non-zero exit code: {result.returncode}" + ) + if result.stderr: + print(f"stderr: {result.stderr}") + + # Check if stub file was generated + if os.path.exists(stub_path): + print(f"✓ Runtime stubs generated: {stub_path}") + return True + else: + print(f"✗ Stub file was not generated") + return False + + except subprocess.TimeoutExpired: + print("✗ opt command timed out") + return False + except Exception as e: + print(f"✗ Failed to generate stubs: {e}") + return False + + def review_and_save(self, output_path: str) -> bool: + """Review configuration and save.""" + stub_path = None + + while True: + self.clear_screen() + self.print_section_header("Step 5: Review and Save") + + print("\nEnabled instrumentation types:") + for opp in sorted(self.enabled_opportunities): + print(f" ✓ {opp}") + + print( + f"\nPRE/POST mode: {'Same configuration' if self.same_pre_post else 'Different configurations'}" + ) + print( + f"Runtime prefix: {self.config.get('configuration', {}).get('runtime_prefix', '__instrumentor_')}" + ) + print(f"\nConfiguration file: {output_path}") + if stub_path: + print(f"Runtime stubs file: {stub_path}") + + print("\nCommands:") + print(" - 's' to save configuration and finish") + print(" - 'g' to generate runtime stub file (optional)") + print(" - 'p' to specify different output path") + print(" - 'b' to go back and modify settings") + + choice = self.get_user_choice( + "\nYour choice", valid_choices=["s", "g", "p", "b", ""] + ) + + if choice == "BACK" or choice == "b": + return False + elif choice == "s": + try: + # Remove runtime_stubs_file from config before saving + config_to_save = 
json.loads(json.dumps(self.config)) + if "configuration" in config_to_save: + config_to_save["configuration"].pop("runtime_stubs_file", None) + config_to_save["configuration"].pop( + "runtime_stubs_file.description", None + ) + + with open(output_path, "w") as f: + json.dump(config_to_save, f, indent=2) + print(f"\n✓ Configuration saved to: {output_path}") + + # Generate stubs if requested + if stub_path: + self.generate_runtime_stubs(output_path, stub_path) + + return True + except Exception as e: + print(f"\n✗ Failed to save configuration: {e}") + input("Press Enter to continue...") + elif choice == "g": + print("\nGenerate runtime stub file") + print("This creates a C/C++ file with stub implementations of the") + print( + "instrumentation runtime functions that you can use as a template." + ) + + default_stub = output_path.rsplit(".", 1)[0] + "_stubs.c" + stub_input = input( + f"\nStub file path (default: {default_stub}): " + ).strip() + stub_path = stub_input if stub_input else default_stub + print(f"Will generate stubs to: {stub_path}") + input("Press Enter to continue...") + elif choice == "p": + new_path = input("Enter configuration output path: ").strip() + if new_path: + output_path = new_path + elif choice == "": + continue + + def run_interactive(self, output_path: str): + """Run the interactive configuration wizard.""" + self.clear_screen() + print("=" * 70) + print(" LLVM Instrumentor Configuration Wizard") + print("=" * 70) + print( + "\nThis wizard will help you create a custom instrumentation configuration." 
+ ) + print("You can enable/disable instrumentation opportunities and configure") + print("what information is passed to the runtime functions.") + print("\nNavigation: Use 'b' to go back, 'q' to quit at any prompt.") + input("\nPress Enter to continue...") + + # Generate or load config + try: + self.config = self.generate_default_config() + except Exception as e: + print(f"Error: {e}") + return False + + # State machine for navigation + state = 0 + while True: + if state == 0: # Select opportunities + if self.select_opportunities(): + state = 1 + elif state == 1: # PRE/POST mode + if self.configure_pre_post_mode(): + state = 2 + else: + state = 0 + elif state == 2: # Base configuration + if self.configure_base_options(): + state = 3 + else: + state = 1 + elif state == 3: # Configure locations + if self.configure_locations(): + state = 4 + else: + state = 2 + elif state == 4: # Review and save + if self.review_and_save(output_path): + return True + else: + state = 3 + + +def main(): + parser = argparse.ArgumentParser( + description="Interactive wizard for configuring LLVM Instrumentor pass", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Interactive mode (recommended) + %(prog)s + + # Specify custom output location + %(prog)s -o my_config.json + + # Use specific opt binary + %(prog)s --opt-path /path/to/opt + + # Load existing config and modify it + %(prog)s --input existing_config.json -o modified_config.json + """, + ) + + parser.add_argument( + "-o", + "--output", + default="instrumentor_config.json", + help="Output configuration file (default: instrumentor_config.json)", + ) + + parser.add_argument( + "--opt-path", help="Path to the opt binary (default: auto-detect)" + ) + + parser.add_argument( + "--input", help="Load and modify an existing configuration file" + ) + + args = parser.parse_args() + + try: + wizard = InstrumentorConfigWizard(opt_path=args.opt_path) + + # Load existing config if provided + if args.input: + 
print(f"Loading existing configuration from: {args.input}") + with open(args.input, "r") as f: + wizard.config = json.load(f) + print("✓ Configuration loaded\n") + # Extract enabled opportunities from loaded config + for location in [ + "function_pre", + "function_post", + "instruction_pre", + "instruction_post", + ]: + if location in wizard.config: + for opp_name, opp_config in wizard.config[location].items(): + if opp_config.get("enabled", False): + wizard.enabled_opportunities.add(opp_name) + + success = wizard.run_interactive(args.output) + + if success: + print("\n" + "=" * 70) + print("Configuration complete!") + print("=" * 70) + print(f"\nTo use this configuration with opt:") + print(f" opt -passes=instrumentor \\") + print(f" -instrumentor-read-config-file={args.output} \\") + print(f" input.ll -S -o output.ll") + print(f"\nTo use with clang:") + print(f" clang -mllvm -enable-instrumentor \\") + print(f" -mllvm -instrumentor-read-config-file={args.output} \\") + print(f" input.c -o output") + return 0 + else: + return 1 + + except Exception as e: + print(f"\nFatal error: {e}", file=sys.stderr) + import traceback + + traceback.print_exc() + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h index 4115213b00f49..686c038b33a7f 100644 --- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h +++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h @@ -23,32 +23,6 @@ namespace llvm::omp::target::plugin { class L0DeviceTy; class L0ProgramTy; -/// Loop descriptor. -struct TgtLoopDescTy { - int64_t Lb = 0; // The lower bound of the i-th loop. - int64_t Ub = 0; // The upper bound of the i-th loop. - int64_t Stride = 0; // The stride of the i-th loop. 
- - bool operator==(const TgtLoopDescTy &other) const { - return Lb == other.Lb && Ub == other.Ub && Stride == other.Stride; - } -}; - -struct TgtNDRangeDescTy { - int32_t NumLoops = 0; // Number of loops/dimensions. - int32_t DistributeDim = 0; // Dimensions lower than this one - // must end up in one WG. - TgtLoopDescTy Levels[3]; // Up to 3 loops. - - bool operator==(const TgtNDRangeDescTy &other) const { - return NumLoops == other.NumLoops && DistributeDim == other.DistributeDim && - std::equal(Levels, Levels + 3, other.Levels); - } - bool operator!=(const TgtNDRangeDescTy &other) const { - return !(*this == other); - } -}; - /// Forward declaration. struct L0LaunchEnvTy; @@ -59,26 +33,9 @@ struct KernelPropertiesTy { uint32_t MaxThreadGroupSize = 0; uint32_t NumKernelArgs = 0; std::unique_ptr ArgSizes; - - /// Cached input parameters used in the previous launch. - int32_t NumTeams = -1; - int32_t ThreadLimit = -1; - - /// Cached parameters used in the previous launch. ze_kernel_indirect_access_flags_t IndirectAccessFlags = std::numeric_limits::max(); - uint32_t GroupSizes[3] = {0, 0, 0}; - ze_group_count_t GroupCounts{0, 0, 0}; - std::mutex Mtx; - - /// Check if we can reuse group parameters. - bool reuseGroupParams(const int32_t NumTeamsIn, const int32_t ThreadLimitIn, - uint32_t *GroupSizesOut, L0LaunchEnvTy &KEnv) const; - - /// Update cached group parameters. - void cacheGroupParams(const int32_t NumTeamsIn, const int32_t ThreadLimitIn, - const uint32_t *GroupSizesIn, L0LaunchEnvTy &KEnv); }; struct L0LaunchEnvTy { @@ -102,10 +59,6 @@ class L0KernelTy : public GenericKernelTy { // Kernel Properties. 
mutable KernelPropertiesTy Properties; - void decideKernelGroupArguments(L0DeviceTy &Device, uint32_t NumTeams, - uint32_t ThreadLimit, uint32_t *GroupSizes, - L0LaunchEnvTy &KEnv) const; - Error buildKernel(L0ProgramTy &Program); Error readKernelProperties(L0ProgramTy &Program); @@ -143,10 +96,6 @@ class L0KernelTy : public GenericKernelTy { } ze_kernel_handle_t getZeKernel() const { return zeKernel; } - - Error getGroupsShape(L0DeviceTy &Device, int32_t NumTeams, - int32_t ThreadLimit, uint32_t *GroupSizes, - L0LaunchEnvTy &KEnv) const; }; } // namespace llvm::omp::target::plugin diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp index 4a13637d2f0ce..8c4766a1b46e0 100644 --- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp +++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp @@ -19,28 +19,6 @@ namespace llvm::omp::target::plugin { -bool KernelPropertiesTy::reuseGroupParams(const int32_t NumTeamsIn, - const int32_t ThreadLimitIn, - uint32_t *GroupSizesOut, - L0LaunchEnvTy &KEnv) const { - if (NumTeamsIn != NumTeams || ThreadLimitIn != ThreadLimit) - return false; - // Found matching input parameters. 
- std::copy_n(GroupSizes, 3, GroupSizesOut); - KEnv.GroupCounts = GroupCounts; - return true; -} - -void KernelPropertiesTy::cacheGroupParams(const int32_t NumTeamsIn, - const int32_t ThreadLimitIn, - const uint32_t *GroupSizesIn, - L0LaunchEnvTy &KEnv) { - NumTeams = NumTeamsIn; - ThreadLimit = ThreadLimitIn; - std::copy_n(GroupSizesIn, 3, GroupSizes); - GroupCounts = KEnv.GroupCounts; -} - Error L0KernelTy::readKernelProperties(L0ProgramTy &Program) { const auto &l0Device = L0DeviceTy::makeL0Device(Program.getDevice()); auto &KernelPR = getProperties(); @@ -107,167 +85,6 @@ Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice, return Plugin::success(); } -void L0KernelTy::decideKernelGroupArguments(L0DeviceTy &Device, - uint32_t NumTeams, - uint32_t ThreadLimit, - uint32_t *GroupSizes, - L0LaunchEnvTy &KEnv) const { - - const KernelPropertiesTy &KernelPR = getProperties(); - - const auto DeviceId = Device.getDeviceId(); - bool MaxGroupSizeForced = false; - bool MaxGroupCountForced = false; - uint32_t MaxGroupSize = Device.getMaxGroupSize(); - const auto &Option = Device.getPlugin().getOptions(); - const auto OptSubscRate = Option.SubscriptionRate; - auto &GroupCounts = KEnv.GroupCounts; - - uint32_t SIMDWidth = KernelPR.SIMDWidth; - uint32_t KernelWidth = KernelPR.Width; - uint32_t KernelMaxThreadGroupSize = KernelPR.MaxThreadGroupSize; - - if (KernelMaxThreadGroupSize < MaxGroupSize) { - MaxGroupSize = KernelMaxThreadGroupSize; - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "Capping maximum team size to %" PRIu32 - " due to kernel constraints.\n", - MaxGroupSize); - } - - if (ThreadLimit > 0) { - MaxGroupSizeForced = true; - MaxGroupSize = ThreadLimit; - } - - uint32_t MaxGroupCount = 0; - if (NumTeams > 0) { - MaxGroupCount = NumTeams; - MaxGroupCountForced = true; - } - - if (MaxGroupCountForced) { - // If number of teams is specified by the user, then use KernelWidth. 
- // WIs per WG by default, so that it matches - // decideLoopKernelGroupArguments() behavior. - if (!MaxGroupSizeForced) { - MaxGroupSize = KernelWidth; - } - } else { - const uint32_t NumSubslices = Device.getNumSubslices(); - uint32_t NumThreadsPerSubslice = Device.getNumThreadsPerSubslice(); - if (KEnv.HalfNumThreads) - NumThreadsPerSubslice /= 2; - - MaxGroupCount = NumSubslices * NumThreadsPerSubslice; - if (MaxGroupSizeForced) { - // Set group size for the HW capacity. - uint32_t NumThreadsPerGroup = (MaxGroupSize + SIMDWidth - 1) / SIMDWidth; - uint32_t NumGroupsPerSubslice = - (NumThreadsPerSubslice + NumThreadsPerGroup - 1) / NumThreadsPerGroup; - MaxGroupCount = NumGroupsPerSubslice * NumSubslices; - } else { - assert(!MaxGroupSizeForced && !MaxGroupCountForced); - assert((MaxGroupSize <= KernelWidth || MaxGroupSize % KernelWidth == 0) && - "Invalid maxGroupSize"); - // Maximize group size. - while (MaxGroupSize >= KernelWidth) { - uint32_t NumThreadsPerGroup = - (MaxGroupSize + SIMDWidth - 1) / SIMDWidth; - - if (NumThreadsPerSubslice % NumThreadsPerGroup == 0) { - uint32_t NumGroupsPerSubslice = - NumThreadsPerSubslice / NumThreadsPerGroup; - MaxGroupCount = NumGroupsPerSubslice * NumSubslices; - break; - } - MaxGroupSize -= KernelWidth; - } - } - } - - uint32_t GRPCounts[3] = {MaxGroupCount, 1, 1}; - uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1}; - if (!MaxGroupCountForced) { - GRPCounts[0] *= OptSubscRate; - } - GroupCounts.groupCountX = GRPCounts[0]; - GroupCounts.groupCountY = GRPCounts[1]; - GroupCounts.groupCountZ = GRPCounts[2]; - std::copy(GRPSizes, GRPSizes + 3, GroupSizes); -} - -Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams, - int32_t ThreadLimit, uint32_t *GroupSizes, - L0LaunchEnvTy &KEnv) const { - - const auto DeviceId = Device.getDeviceId(); - const auto &KernelPR = getProperties(); - - // Read the most recent global thread limit and max teams. 
- const int32_t NumTeamsICV = 0; - const int32_t ThreadLimitICV = 0; - - bool IsXeHPG = Device.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG); - KEnv.HalfNumThreads = - Device.getPlugin().getOptions().ZeDebugEnabled && IsXeHPG; - uint32_t KernelWidth = KernelPR.Width; - uint32_t SIMDWidth = KernelPR.SIMDWidth; - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "Assumed kernel SIMD width is %" PRIu32 "\n", SIMDWidth); - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "Preferred team size is multiple of %" PRIu32 "\n", KernelWidth); - assert(SIMDWidth <= KernelWidth && "Invalid SIMD width."); - - if (ThreadLimit > 0) { - // use thread_limit clause value default. - ODBG(OLDT_Kernel) << "Max team size is set to " << ThreadLimit - << " (thread_limit clause)"; - } else if (ThreadLimitICV > 0) { - // else use thread-limit-var ICV. - ThreadLimit = ThreadLimitICV; - ODBG(OLDT_Kernel) << "Max team size is set to " << ThreadLimit - << " (thread-limit-icv)"; - } - - size_t MaxThreadLimit = Device.getMaxGroupSize(); - // Set correct max group size if the kernel was compiled with explicit SIMD. - if (SIMDWidth == 1) - MaxThreadLimit = Device.getNumThreadsPerSubslice(); - - if (KernelPR.MaxThreadGroupSize < MaxThreadLimit) { - MaxThreadLimit = KernelPR.MaxThreadGroupSize; - ODBG(OLDT_Kernel) << "Capping maximum team size to " << MaxThreadLimit - << " due to kernel constraints."; - } - - if (ThreadLimit > static_cast(MaxThreadLimit)) { - ThreadLimit = MaxThreadLimit; - ODBG(OLDT_Kernel) << "Max team size exceeds current maximum " - << MaxThreadLimit << ". Adjusted"; - } - // scope code to ease integration with downstream custom code. - { - if (NumTeams > 0) { - ODBG(OLDT_Kernel) << "Number of teams is set to " << NumTeams - << " (num_teams clause or no teams construct)"; - } else if (NumTeamsICV > 0) { - // OMP_NUM_TEAMS only matters, if num_teams() clause is absent. 
- INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "OMP_NUM_TEAMS(%" PRId32 ") is ignored\n", NumTeamsICV); - - NumTeams = NumTeamsICV; - ODBG(OLDT_Kernel) << "Max number of teams is set to " << NumTeams - << " (OMP_NUM_TEAMS)"; - } - - decideKernelGroupArguments(Device, (uint32_t)NumTeams, - (uint32_t)ThreadLimit, GroupSizes, KEnv); - } - - return Plugin::success(); -} - static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device, ze_kernel_handle_t zeKernel, L0LaunchEnvTy &KEnv, @@ -379,41 +196,18 @@ static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device, Error L0KernelTy::setKernelGroups(L0DeviceTy &l0Device, L0LaunchEnvTy &KEnv, uint32_t NumThreads[3], uint32_t NumBlocks[3]) const { - - bool HasUserDefinedGroups = NumThreads[0] != 0 && NumThreads[1] != 0 && - NumThreads[2] != 0 && NumBlocks[0] != 0 && - NumBlocks[1] != 0 && NumBlocks[2] != 0; + assert(NumThreads[0] > 0 && NumThreads[1] > 0 && NumThreads[2] > 0 && + "Pre-computed ThreadLimit values must be non-zero"); + assert(NumBlocks[0] > 0 && NumBlocks[1] > 0 && NumBlocks[2] > 0 && + "Pre-computed NumTeams values must be non-zero"); uint32_t GroupSizes[3]; - bool CanReuseParams = false; - - if (HasUserDefinedGroups) { - KEnv.GroupCounts = {NumBlocks[0], NumBlocks[1], NumBlocks[2]}; - // Respect max group size attribute in the kernel. - uint32_t MaxGroupSize = KEnv.KernelPR.MaxThreadGroupSize; - GroupSizes[0] = std::min(MaxGroupSize, NumThreads[0]); - GroupSizes[1] = std::min(MaxGroupSize, NumThreads[1]); - GroupSizes[2] = std::min(MaxGroupSize, NumThreads[2]); - } else { - int32_t NumTeams = NumBlocks[0]; - int32_t ThreadLimit = NumThreads[0]; - if (NumTeams < 0) - NumTeams = 0; - if (ThreadLimit < 0) - ThreadLimit = 0; - - auto &KernelPR = KEnv.KernelPR; - // Check if we can reuse previous group parameters. 
- CanReuseParams = - KernelPR.reuseGroupParams(NumTeams, ThreadLimit, GroupSizes, KEnv); - - if (!CanReuseParams) { - if (auto Err = - getGroupsShape(l0Device, NumTeams, ThreadLimit, GroupSizes, KEnv)) - return Err; - KernelPR.cacheGroupParams(NumTeams, ThreadLimit, GroupSizes, KEnv); - } - } + KEnv.GroupCounts = {NumBlocks[0], NumBlocks[1], NumBlocks[2]}; + // Respect max group size attribute in the kernel. + uint32_t MaxGroupSize = KEnv.KernelPR.MaxThreadGroupSize; + GroupSizes[0] = std::min(MaxGroupSize, NumThreads[0]); + GroupSizes[1] = std::min(MaxGroupSize, NumThreads[1]); + GroupSizes[2] = std::min(MaxGroupSize, NumThreads[2]); auto DeviceId = l0Device.getDeviceId(); INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, @@ -424,10 +218,8 @@ Error L0KernelTy::setKernelGroups(L0DeviceTy &l0Device, L0LaunchEnvTy &KEnv, KEnv.GroupCounts.groupCountX, KEnv.GroupCounts.groupCountY, KEnv.GroupCounts.groupCountZ); - if (!CanReuseParams) { - CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), GroupSizes[0], - GroupSizes[1], GroupSizes[2]); - } + CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), GroupSizes[0], + GroupSizes[1], GroupSizes[2]); return Plugin::success(); } diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index 6b64764110c73..ff7ee4f8567f7 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -2303,6 +2303,7 @@ cc_library( deps = [ ":PluginDynamicLoaderPosixDYLDHeaders", ":PluginObjectFileELF", + ":PluginObjectFilePlaceholder", ":PluginProcessUtility", "//lldb:Core", "//lldb:Target",