Skip to content

Commit c6a0548

Browse files
committed
[AArch64] Improve cost computations for odd vector mem ops.
Improve cost computation for odd vector mem ops by breaking them down into smaller power-of-2 parts and summing up the cost of those parts. This fixes the current cost estimates, which for the most part underestimated the cost due to using getTypeLegalizationCost, which in most cases widens to the next power-of-2 in a single step. This doesn't reflect the actual cost. See https://llvm.godbolt.org/z/vMsnxMf1v for the codegen for the tests. Note that there is a special case for v3i8, for which current codegen is pretty bad due to automatic widening to v4i8, which in turn requires the conversion to go through memory ops on the stack. I am planning on fixing that as a follow-up, but I am not yet sure where best to fix this. At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: llvm#77790
1 parent 2eb71e8 commit c6a0548

File tree

2 files changed

+73
-40
lines changed

2 files changed

+73
-40
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

+41-8
Original file line numberDiff line numberDiff line change
@@ -3176,14 +3176,47 @@ InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
31763176
if (Ty->isPtrOrPtrVectorTy())
31773177
return LT.first;
31783178

3179-
// Check truncating stores and extending loads.
3180-
if (useNeonVector(Ty) &&
3181-
Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3182-
// v4i8 types are lowered to scalar a load/store and sshll/xtn.
3183-
if (VT == MVT::v4i8)
3184-
return 2;
3185-
// Otherwise we need to scalarize.
3186-
return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3179+
if (useNeonVector(Ty)) {
3180+
// Check truncating stores and extending loads.
3181+
if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3182+
// v4i8 types are lowered to scalar a load/store and sshll/xtn.
3183+
if (VT == MVT::v4i8)
3184+
return 2;
3185+
// Otherwise we need to scalarize.
3186+
return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3187+
}
3188+
EVT EltVT = VT.getVectorElementType();
3189+
unsigned EltSize = EltVT.getScalarSizeInBits();
3190+
if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
3191+
VT.getVectorNumElements() >= 128 / EltSize || !Alignment ||
3192+
*Alignment != Align(1))
3193+
return LT.first;
3194+
// FIXME: v3i8 lowering currently is very inefficient, due to automatic
3195+
// widening to v4i8, which produces suboptimal results.
3196+
if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3197+
return LT.first;
3198+
3199+
// Check non-power-of-2 loads/stores for legal vector element types with
3200+
// NEON. Non-power-of-2 memory ops will get broken down to a set of
3201+
// operations on smaller power-of-2 ops, including ld1/st1.
3202+
LLVMContext &C = Ty->getContext();
3203+
InstructionCost Cost(0);
3204+
SmallVector<EVT> TypeWorklist;
3205+
TypeWorklist.push_back(VT);
3206+
while (!TypeWorklist.empty()) {
3207+
EVT CurrVT = TypeWorklist.pop_back_val();
3208+
unsigned CurrNumElements = CurrVT.getVectorNumElements();
3209+
if (isPowerOf2_32(CurrNumElements)) {
3210+
Cost += 1;
3211+
continue;
3212+
}
3213+
3214+
unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
3215+
TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
3216+
TypeWorklist.push_back(
3217+
EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
3218+
}
3219+
return Cost;
31873220
}
31883221

31893222
return LT.first;

llvm/test/Analysis/CostModel/AArch64/vec3-ops.ll

+32-32
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33

44
define void @vec3_i32(<3 x i32> %a, <3 x i32> %b, ptr %src, ptr %dst) {
55
; CHECK-LABEL: 'vec3_i32'
6-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <3 x i32>, ptr %src, align 1
6+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <3 x i32>, ptr %src, align 1
77
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <3 x i32> %l, %b
88
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp uge <3 x i32> %add, %a
99
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub = sub <3 x i32> %add, %a
1010
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sel = select <3 x i1> %cmp, <3 x i32> %add, <3 x i32> %sub
11-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> %sel, ptr %dst, align 1
11+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <3 x i32> %sel, ptr %dst, align 1
1212
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
1313
;
1414
%l = load <3 x i32>, ptr %src, align 1
@@ -41,12 +41,12 @@ define void @vec3_i32_default_alignment(<3 x i32> %a, <3 x i32> %b, ptr %src, pt
4141

4242
define void @vec3_i16(<3 x i16> %a, <3 x i16> %b, ptr %src, ptr %dst) {
4343
; CHECK-LABEL: 'vec3_i16'
44-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <3 x i16>, ptr %src, align 1
44+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <3 x i16>, ptr %src, align 1
4545
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <3 x i16> %l, %b
4646
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp uge <3 x i16> %add, %a
4747
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub = sub <3 x i16> %add, %a
4848
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sel = select <3 x i1> %cmp, <3 x i16> %add, <3 x i16> %sub
49-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i16> %sel, ptr %dst, align 1
49+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <3 x i16> %sel, ptr %dst, align 1
5050
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
5151
;
5252
%l = load <3 x i16>, ptr %src, align 1
@@ -60,9 +60,9 @@ define void @vec3_i16(<3 x i16> %a, <3 x i16> %b, ptr %src, ptr %dst) {
6060

6161
define void @vec7_i16(ptr %src, ptr %dst) {
6262
; CHECK-LABEL: 'vec7_i16'
63-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <7 x i16>, ptr %src, align 1
63+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %l = load <7 x i16>, ptr %src, align 1
6464
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <7 x i16> %l, %l
65-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <7 x i16> %add, ptr %dst, align 1
65+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: store <7 x i16> %add, ptr %dst, align 1
6666
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
6767
;
6868
%l = load <7 x i16>, ptr %src, align 1
@@ -73,9 +73,9 @@ define void @vec7_i16(ptr %src, ptr %dst) {
7373

7474
define void @vec6_i16(ptr %src, ptr %dst) {
7575
; CHECK-LABEL: 'vec6_i16'
76-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <6 x i16>, ptr %src, align 1
76+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <6 x i16>, ptr %src, align 1
7777
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <6 x i16> %l, %l
78-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <6 x i16> %add, ptr %dst, align 1
78+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <6 x i16> %add, ptr %dst, align 1
7979
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
8080
;
8181
%l = load <6 x i16>, ptr %src, align 1
@@ -86,9 +86,9 @@ define void @vec6_i16(ptr %src, ptr %dst) {
8686

8787
define void @vec5_i16(ptr %src, ptr %dst) {
8888
; CHECK-LABEL: 'vec5_i16'
89-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <5 x i16>, ptr %src, align 1
89+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <5 x i16>, ptr %src, align 1
9090
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <5 x i16> %l, %l
91-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <5 x i16> %add, ptr %dst, align 1
91+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <5 x i16> %add, ptr %dst, align 1
9292
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
9393
;
9494
%l = load <5 x i16>, ptr %src, align 1
@@ -99,12 +99,12 @@ define void @vec5_i16(ptr %src, ptr %dst) {
9999

100100
define void @vec3_i16_zext_i32(<3 x i32> %a, <3 x i32> %b, ptr %src, ptr %dst) {
101101
; CHECK-LABEL: 'vec3_i16_zext_i32'
102-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <3 x i16>, ptr %src, align 1
102+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <3 x i16>, ptr %src, align 1
103103
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l.ext = zext <3 x i16> %l to <3 x i32>
104104
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <3 x i32> %l.ext, %b
105105
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub = sub <3 x i32> %add, %a
106106
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub.trunc = trunc <3 x i32> %sub to <3 x i16>
107-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i16> %sub.trunc, ptr %dst, align 1
107+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <3 x i16> %sub.trunc, ptr %dst, align 1
108108
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
109109
;
110110
%l = load <3 x i16>, ptr %src, align 1
@@ -190,10 +190,10 @@ define void @vec3_i30(<3 x i30> %a, <3 x i30> %b, ptr %src, ptr %dst) {
190190

191191
define void @vec3_float(<3 x float> %a, <3 x float> %b, ptr %src, ptr %dst) {
192192
; CHECK-LABEL: 'vec3_float'
193-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <3 x float>, ptr %src, align 1
193+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <3 x float>, ptr %src, align 1
194194
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = fadd <3 x float> %l, %b
195195
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub = fsub <3 x float> %add, %a
196-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x float> %sub, ptr %dst, align 1
196+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <3 x float> %sub, ptr %dst, align 1
197197
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
198198
;
199199
%l = load <3 x float>, ptr %src, align 1
@@ -205,10 +205,10 @@ define void @vec3_float(<3 x float> %a, <3 x float> %b, ptr %src, ptr %dst) {
205205

206206
define void @vec3_half(<3 x half> %a, <3 x half> %b, ptr %src, ptr %dst) {
207207
; CHECK-LABEL: 'vec3_half'
208-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <3 x half>, ptr %src, align 1
208+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <3 x half>, ptr %src, align 1
209209
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add = fadd <3 x half> %l, %b
210210
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sub = fsub <3 x half> %add, %a
211-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x half> %sub, ptr %dst, align 1
211+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <3 x half> %sub, ptr %dst, align 1
212212
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
213213
;
214214
%l = load <3 x half>, ptr %src, align 1
@@ -220,9 +220,9 @@ define void @vec3_half(<3 x half> %a, <3 x half> %b, ptr %src, ptr %dst) {
220220

221221
define void @vec15_i8(ptr %src, ptr %dst) {
222222
; CHECK-LABEL: 'vec15_i8'
223-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <15 x i8>, ptr %src, align 1
223+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %l = load <15 x i8>, ptr %src, align 1
224224
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <15 x i8> %l, %l
225-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <15 x i8> %add, ptr %dst, align 1
225+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <15 x i8> %add, ptr %dst, align 1
226226
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
227227
;
228228
%l = load <15 x i8>, ptr %src, align 1
@@ -233,9 +233,9 @@ define void @vec15_i8(ptr %src, ptr %dst) {
233233

234234
define void @vec14_i8(ptr %src, ptr %dst) {
235235
; CHECK-LABEL: 'vec14_i8'
236-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <14 x i8>, ptr %src, align 1
236+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %l = load <14 x i8>, ptr %src, align 1
237237
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <14 x i8> %l, %l
238-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <14 x i8> %add, ptr %dst, align 1
238+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: store <14 x i8> %add, ptr %dst, align 1
239239
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
240240
;
241241
%l = load <14 x i8>, ptr %src, align 1
@@ -246,9 +246,9 @@ define void @vec14_i8(ptr %src, ptr %dst) {
246246

247247
define void @vec13_i8(ptr %src, ptr %dst) {
248248
; CHECK-LABEL: 'vec13_i8'
249-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <13 x i8>, ptr %src, align 1
249+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %l = load <13 x i8>, ptr %src, align 1
250250
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <13 x i8> %l, %l
251-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <13 x i8> %add, ptr %dst, align 1
251+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: store <13 x i8> %add, ptr %dst, align 1
252252
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
253253
;
254254
%l = load <13 x i8>, ptr %src, align 1
@@ -259,9 +259,9 @@ define void @vec13_i8(ptr %src, ptr %dst) {
259259

260260
define void @vec12_i8(ptr %src, ptr %dst) {
261261
; CHECK-LABEL: 'vec12_i8'
262-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <12 x i8>, ptr %src, align 1
262+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <12 x i8>, ptr %src, align 1
263263
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <12 x i8> %l, %l
264-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <12 x i8> %add, ptr %dst, align 1
264+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <12 x i8> %add, ptr %dst, align 1
265265
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
266266
;
267267
%l = load <12 x i8>, ptr %src, align 1
@@ -272,9 +272,9 @@ define void @vec12_i8(ptr %src, ptr %dst) {
272272

273273
define void @vec11_i8(ptr %src, ptr %dst) {
274274
; CHECK-LABEL: 'vec11_i8'
275-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <11 x i8>, ptr %src, align 1
275+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %l = load <11 x i8>, ptr %src, align 1
276276
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <11 x i8> %l, %l
277-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <11 x i8> %add, ptr %dst, align 1
277+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: store <11 x i8> %add, ptr %dst, align 1
278278
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
279279
;
280280
%l = load <11 x i8>, ptr %src, align 1
@@ -285,10 +285,10 @@ define void @vec11_i8(ptr %src, ptr %dst) {
285285

286286
define void @vec7_i8(<7 x i8> %a, <7 x i8> %b, ptr %src, ptr %dst) {
287287
; CHECK-LABEL: 'vec7_i8'
288-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <7 x i8>, ptr %src, align 1
288+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %l = load <7 x i8>, ptr %src, align 1
289289
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <7 x i8> %l, %b
290290
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub = sub <7 x i8> %add, %a
291-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <7 x i8> %sub, ptr %dst, align 1
291+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: store <7 x i8> %sub, ptr %dst, align 1
292292
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
293293
;
294294
%l = load <7 x i8>, ptr %src, align 1
@@ -300,9 +300,9 @@ define void @vec7_i8(<7 x i8> %a, <7 x i8> %b, ptr %src, ptr %dst) {
300300

301301
define void @vec6_i8(ptr %src, ptr %dst) {
302302
; CHECK-LABEL: 'vec6_i8'
303-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <6 x i8>, ptr %src, align 1
303+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <6 x i8>, ptr %src, align 1
304304
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <6 x i8> %l, %l
305-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <6 x i8> %add, ptr %dst, align 1
305+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <6 x i8> %add, ptr %dst, align 1
306306
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
307307
;
308308
%l = load <6 x i8>, ptr %src, align 1
@@ -313,9 +313,9 @@ define void @vec6_i8(ptr %src, ptr %dst) {
313313

314314
define void @vec5_i8(ptr %src, ptr %dst) {
315315
; CHECK-LABEL: 'vec5_i8'
316-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <5 x i8>, ptr %src, align 1
316+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <5 x i8>, ptr %src, align 1
317317
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <5 x i8> %l, %l
318-
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <5 x i8> %add, ptr %dst, align 1
318+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <5 x i8> %add, ptr %dst, align 1
319319
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
320320
;
321321
%l = load <5 x i8>, ptr %src, align 1

0 commit comments

Comments
 (0)