-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[CostModel][X86] Update baseline CTTZ/CTLZ costs for x86_64 #124312
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Follow-up to llvm#123623 - now that the CMOV has been removed, the throughput has improved, reducing the benefit of vectorization on pre-x86-64-v3 CPUs.
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon). Changes: Follow-up to #123623 - now that the CMOV has been removed, the throughput has improved, reducing the benefit of vectorization on pre-x86-64-v3 CPUs. Patch is 34.62 KiB, truncated to 20.00 KiB below (full version: https://github.com/llvm/llvm-project/pull/124312.diff). 10 Files Affected:
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index d3c923a76d074c..cdc2ce752743cc 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4329,9 +4329,15 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
{ ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
{ ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
- { ISD::CTLZ, MVT::i64, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
+ { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
+ { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
+ { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
{ ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
- { ISD::CTTZ, MVT::i64, { 2, 2, 3, 4 } }, // TEST+BSF+CMOV/BRANCH
+ { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
+ { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
+ { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
+ { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
{ ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
{ ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
{ ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
index da0f71c63ef80e..9f8e4edf7a0fc1 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
@@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1)
define i64 @var_ctlz_i64(i64 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
define i32 @var_ctlz_i32(i32 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
define i16 @var_ctlz_i16(i16 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i16'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
define i8 @var_ctlz_i8(i8 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i8'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
index 2425e7286265b0..fc3516695852aa 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
@@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1)
define i64 @var_ctlz_i64(i64 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
define i32 @var_ctlz_i32(i32 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
define i16 @var_ctlz_i16(i16 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i16'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
define i8 @var_ctlz_i8(i8 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i8'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz.ll b/llvm/test/Analysis/CostModel/X86/ctlz.ll
index fa7982ce09e9ce..d9d04de12467da 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz.ll
@@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1)
define i64 @var_ctlz_i64(i64 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
define i32 @var_ctlz_i32(i32 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i32'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
index 07bf1dd7a2ff6c..621c1b9320fc8d 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
@@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1)
define i64 @var_cttz_i64(i64 %a) {
; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz
;
; BMI-LABEL: 'var_cttz_i64'
@@ -40,7 +40,7 @@ define i64 @var_cttz_i64u(i64 %a) {
define i32 @var_cttz_i32(i32 %a) {
; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
;
; BMI-LABEL: 'var_cttz_i32'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
index afe5cb8c55fe65..34d363ce008795 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
@@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1)
define i64 @var_cttz_i64(i64 %a) {
; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz
;
; BMI-LABEL: 'var_cttz_i64'
@@ -44,7 +44,7 @@ define i64 @var_cttz_i64u(i64 %a) {
define i32 @var_cttz_i32(i32 %a) {
; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
;
; BMI-LABEL: 'var_cttz_i32'
@@ -70,7 +70,7 @@ define i32 @var_cttz_i32u(i32 %a) {
define i16 @var_cttz_i16(i16 %a) {
; NOBMI-LABEL: 'var_cttz_i16'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %cttz
;
; BMI-LABEL: 'var_cttz_i16'
@@ -96,7 +96,7 @@ define i16 @var_cttz_i16u(i16 %a) {
define i8 @var_cttz_i8(i8 %a) {
; NOBMI-LABEL: 'var_cttz_i8'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %cttz
;
; BMI-LABEL: 'var_cttz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz.ll b/llvm/test/Analysis/CostModel/X86/cttz.ll
index fa0f10f886f633..3f5a731b27d9bc 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz.ll
@@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1)
define i64 @var_cttz_i64(i64 %a) {
; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %cttz
;
; BMI-LABEL: 'var_cttz_i64'
@@ -44,7 +44,7 @@ define i64 @var_cttz_i64u(i64 %a) {
define i32 @var_cttz_i32(i32 %a) {
; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %cttz
;
; BMI-LABEL: 'var_cttz_i32'
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index 062e5f157bae25..bcef47ee9e0567 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -232,7 +232,7 @@ define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) {
define void @cttz(i32 %a, <16 x i32> %va) {
; THRU-LABEL: 'cttz'
-; THRU-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; THRU-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
@@ -242,12 +242,12 @@ define void @cttz(i32 %a, <16 x i32> %va) {
; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE-LABEL: 'cttz'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE_LATE-LABEL: 'cttz'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
index 8a22e45fe1ca57..9bf2ade3176d60 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=icelake-server -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
@@ -136,32 +136,47 @@ define void @ctlz_4i64() #0 {
}
define void @ctlz_4i32() #0 {
-; SSE2-LABEL: @ctlz_4i32(
-; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
-; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
-; SSE2-NEXT: ret void
+; SSE-LABEL: @ctlz_4i32(
+; SSE-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4
+; SSE-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
+; SSE-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
+; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; SSE-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4
+; SSE-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
+; SSE-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @ctlz_4i32(
+; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4
+; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
+; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
+; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
+; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; AVX1-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; AVX1-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4
+; AVX1-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
+; AVX1-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
+; AVX1-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
+; AVX1-NEXT: ret void
;
-; SSE4-LABEL: @ctlz_4i32(
-; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4
-; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
-; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
-; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
-; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
-; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
-; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
-; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
-; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4
-; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
-; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
-; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
-; SSE4-NEXT: ret void
+; AVX2-LABEL: @ctlz_4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
+; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; AVX2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
+; AVX2-NEXT: ret void
;
-; AVX-LABEL: @ctlz_4i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
-; AVX-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; AVX-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
-; AVX-NEXT: ret void
+; AVX512-LABEL: @ctlz_4i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
+; AVX512-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; AVX512-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
+; AVX512-NEXT: ret void
;
%ld0 = load i32, ptr @src32, align 4
%ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
@@ -179,47 +194,71 @@ define void @ctlz_4i32() #0 {
}
define void @ctlz_8i32() #0 {
-; SSE2-LABEL: @ctlz_8i32(
-; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2
-; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE2-NEXT: stor...
[truncated]
|
@llvm/pr-subscribers-llvm-analysis Author: Simon Pilgrim (RKSimon). Changes: Follow-up to #123623 - now that the CMOV has been removed, the throughput has improved, reducing the benefit of vectorization on pre-x86-64-v3 CPUs. Patch is 34.62 KiB, truncated to 20.00 KiB below (full version: https://github.com/llvm/llvm-project/pull/124312.diff). 10 Files Affected:
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index d3c923a76d074c..cdc2ce752743cc 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4329,9 +4329,15 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
{ ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
{ ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
- { ISD::CTLZ, MVT::i64, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
+ { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
+ { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
+ { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
{ ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
- { ISD::CTTZ, MVT::i64, { 2, 2, 3, 4 } }, // TEST+BSF+CMOV/BRANCH
+ { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
+ { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
+ { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
+ { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
{ ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
{ ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
{ ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
index da0f71c63ef80e..9f8e4edf7a0fc1 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll
@@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1)
define i64 @var_ctlz_i64(i64 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
define i32 @var_ctlz_i32(i32 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
define i16 @var_ctlz_i16(i16 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i16'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
define i8 @var_ctlz_i8(i8 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i8'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
index 2425e7286265b0..fc3516695852aa 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll
@@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1)
define i64 @var_ctlz_i64(i64 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
define i32 @var_ctlz_i32(i32 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
define i16 @var_ctlz_i16(i16 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i16'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
define i8 @var_ctlz_i8(i8 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i8'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/ctlz.ll b/llvm/test/Analysis/CostModel/X86/ctlz.ll
index fa7982ce09e9ce..d9d04de12467da 100644
--- a/llvm/test/Analysis/CostModel/X86/ctlz.ll
+++ b/llvm/test/Analysis/CostModel/X86/ctlz.ll
@@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1)
define i64 @var_ctlz_i64(i64 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i64'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
define i32 @var_ctlz_i32(i32 %a) {
; NOLZCNT-LABEL: 'var_ctlz_i32'
-; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz
;
; LZCNT-LABEL: 'var_ctlz_i32'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
index 07bf1dd7a2ff6c..621c1b9320fc8d 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
@@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1)
define i64 @var_cttz_i64(i64 %a) {
; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz
;
; BMI-LABEL: 'var_cttz_i64'
@@ -40,7 +40,7 @@ define i64 @var_cttz_i64u(i64 %a) {
define i32 @var_cttz_i32(i32 %a) {
; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
;
; BMI-LABEL: 'var_cttz_i32'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
index afe5cb8c55fe65..34d363ce008795 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
@@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1)
define i64 @var_cttz_i64(i64 %a) {
; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz
;
; BMI-LABEL: 'var_cttz_i64'
@@ -44,7 +44,7 @@ define i64 @var_cttz_i64u(i64 %a) {
define i32 @var_cttz_i32(i32 %a) {
; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
;
; BMI-LABEL: 'var_cttz_i32'
@@ -70,7 +70,7 @@ define i32 @var_cttz_i32u(i32 %a) {
define i16 @var_cttz_i16(i16 %a) {
; NOBMI-LABEL: 'var_cttz_i16'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %cttz
;
; BMI-LABEL: 'var_cttz_i16'
@@ -96,7 +96,7 @@ define i16 @var_cttz_i16u(i16 %a) {
define i8 @var_cttz_i8(i8 %a) {
; NOBMI-LABEL: 'var_cttz_i8'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %cttz
;
; BMI-LABEL: 'var_cttz_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/cttz.ll b/llvm/test/Analysis/CostModel/X86/cttz.ll
index fa0f10f886f633..3f5a731b27d9bc 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz.ll
@@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1)
define i64 @var_cttz_i64(i64 %a) {
; NOBMI-LABEL: 'var_cttz_i64'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %cttz
;
; BMI-LABEL: 'var_cttz_i64'
@@ -44,7 +44,7 @@ define i64 @var_cttz_i64u(i64 %a) {
define i32 @var_cttz_i32(i32 %a) {
; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %cttz
;
; BMI-LABEL: 'var_cttz_i32'
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index 062e5f157bae25..bcef47ee9e0567 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -232,7 +232,7 @@ define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) {
define void @cttz(i32 %a, <16 x i32> %va) {
; THRU-LABEL: 'cttz'
-; THRU-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; THRU-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
@@ -242,12 +242,12 @@ define void @cttz(i32 %a, <16 x i32> %va) {
; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE-LABEL: 'cttz'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE_LATE-LABEL: 'cttz'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
index 8a22e45fe1ca57..9bf2ade3176d60 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=icelake-server -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
@@ -136,32 +136,47 @@ define void @ctlz_4i64() #0 {
}
define void @ctlz_4i32() #0 {
-; SSE2-LABEL: @ctlz_4i32(
-; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
-; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
-; SSE2-NEXT: ret void
+; SSE-LABEL: @ctlz_4i32(
+; SSE-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4
+; SSE-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
+; SSE-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
+; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; SSE-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4
+; SSE-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
+; SSE-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @ctlz_4i32(
+; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4
+; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
+; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
+; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
+; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; AVX1-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; AVX1-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4
+; AVX1-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
+; AVX1-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
+; AVX1-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
+; AVX1-NEXT: ret void
;
-; SSE4-LABEL: @ctlz_4i32(
-; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4
-; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
-; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
-; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
-; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
-; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
-; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
-; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
-; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4
-; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
-; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
-; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
-; SSE4-NEXT: ret void
+; AVX2-LABEL: @ctlz_4i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
+; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; AVX2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
+; AVX2-NEXT: ret void
;
-; AVX-LABEL: @ctlz_4i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
-; AVX-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; AVX-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
-; AVX-NEXT: ret void
+; AVX512-LABEL: @ctlz_4i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
+; AVX512-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; AVX512-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4
+; AVX512-NEXT: ret void
;
%ld0 = load i32, ptr @src32, align 4
%ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
@@ -179,47 +194,71 @@ define void @ctlz_4i32() #0 {
}
define void @ctlz_8i32() #0 {
-; SSE2-LABEL: @ctlz_8i32(
-; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2
-; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE2-NEXT: stor...
[truncated]
|
You can test this locally with the following command:
git-clang-format --diff a94226f9e6f5be4d6978134e7813f22b0510f3d4 96a7660b22a4d5cabb022ddbb8b2d2fbd5653b72 --extensions cpp -- llvm/lib/Target/X86/X86TargetTransformInfo.cpp
View the diff from clang-format here.
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index cdc2ce7527..9a60dd36ed 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4325,37 +4325,38 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
{ ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
};
- static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
- { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
- { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
- { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
- { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
- { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
- { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
- { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
- { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
- { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
- { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
- { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
- { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
- { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
- { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
- { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
- { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
- { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
- { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
- { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
- { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
- { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
- { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
- { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
- { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
- { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
- { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
- { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
- { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
- { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
- { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
+ static const CostKindTblEntry X64CostTbl[] = {
+ // 64-bit targets
+ {ISD::ABS, MVT::i64, {1, 2, 3, 3}}, // SUB+CMOV
+ {ISD::BITREVERSE, MVT::i64, {10, 12, 20, 22}},
+ {ISD::BSWAP, MVT::i64, {1, 2, 1, 2}},
+ {ISD::CTLZ, MVT::i64, {1, 2, 3, 3}}, // MOV+BSR+XOR
+ {ISD::CTLZ, MVT::i32, {1, 2, 3, 3}}, // MOV+BSR+XOR
+ {ISD::CTLZ, MVT::i16, {2, 2, 3, 3}}, // MOV+BSR+XOR
+ {ISD::CTLZ, MVT::i8, {2, 2, 4, 3}}, // MOV+BSR+XOR
+ {ISD::CTLZ_ZERO_UNDEF, MVT::i64, {1, 2, 2, 2}}, // BSR+XOR
+ {ISD::CTTZ, MVT::i64, {1, 2, 2, 2}}, // MOV+BSF
+ {ISD::CTTZ, MVT::i32, {1, 2, 2, 2}}, // MOV+BSF
+ {ISD::CTTZ, MVT::i16, {2, 2, 2, 2}}, // MOV+BSF
+ {ISD::CTTZ, MVT::i8, {2, 2, 2, 2}}, // MOV+BSF
+ {ISD::CTTZ_ZERO_UNDEF, MVT::i64, {1, 2, 1, 2}}, // BSF
+ {ISD::CTPOP, MVT::i64, {10, 6, 19, 19}},
+ {ISD::ROTL, MVT::i64, {2, 3, 1, 3}},
+ {ISD::ROTR, MVT::i64, {2, 3, 1, 3}},
+ {X86ISD::VROTLI, MVT::i64, {1, 1, 1, 1}},
+ {ISD::FSHL, MVT::i64, {4, 4, 1, 4}},
+ {ISD::SADDSAT, MVT::i64, {4, 4, 7, 10}},
+ {ISD::SSUBSAT, MVT::i64, {4, 5, 8, 11}},
+ {ISD::UADDSAT, MVT::i64, {2, 3, 4, 7}},
+ {ISD::USUBSAT, MVT::i64, {2, 3, 4, 7}},
+ {ISD::SMAX, MVT::i64, {1, 3, 2, 3}},
+ {ISD::SMIN, MVT::i64, {1, 3, 2, 3}},
+ {ISD::UMAX, MVT::i64, {1, 3, 2, 3}},
+ {ISD::UMIN, MVT::i64, {1, 3, 2, 3}},
+ {ISD::SADDO, MVT::i64, {2, 2, 4, 6}},
+ {ISD::UADDO, MVT::i64, {2, 2, 4, 6}},
+ {ISD::SMULO, MVT::i64, {4, 4, 4, 6}},
+ {ISD::UMULO, MVT::i64, {8, 8, 4, 7}},
};
static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
{ ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at:
https://lab.llvm.org/buildbot/#/builders/50/builds/9423
Here is the relevant piece of the build log for reference:
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at:
https://lab.llvm.org/buildbot/#/builders/52/builds/5567
Here is the relevant piece of the build log for reference:
|
Followup to #123623 - now that the CMOV has been removed, the throughput has improved, reducing the benefit of vectorization on pre-x86-64-v3 CPUs