Skip to content

Commit a786cde

Browse files
committed
[AArch64] Add custom lowering for load <3 x i8>.
Add custom combine to lower load <3 x i8> as the more efficient sequence below: ldrb wX, [x0, #2]; ldrh wY, [x0]; orr wX, wY, wX, lsl #16; fmov s0, wX. At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: llvm#77790
1 parent 8336515 commit a786cde

File tree

2 files changed

+65
-33
lines changed

2 files changed

+65
-33
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+52-2
Original file line numberDiff line numberDiff line change
@@ -21095,6 +21095,50 @@ static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
2109521095
return SDValue();
2109621096
}
2109721097

21098+
// A custom combine to lower load <3 x i8> as the more efficient sequence
21099+
// below:
21100+
// ldrb wX, [x0, #2]
21101+
// ldrh wY, [x0]
21102+
// orr wX, wY, wX, lsl #16
21103+
// fmov s0, wX
21104+
//
21105+
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
21106+
EVT MemVT = LD->getMemoryVT();
21107+
if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
21108+
LD->getOriginalAlign() >= 4)
21109+
return SDValue();
21110+
21111+
SDLoc DL(LD);
21112+
SDValue Chain = LD->getChain();
21113+
SDValue BasePtr = LD->getBasePtr();
21114+
21115+
// Load 2 x i8, then 1 x i8.
21116+
SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(),
21117+
LD->getOriginalAlign());
21118+
SDValue L8 =
21119+
DAG.getLoad(MVT::i8, DL, Chain,
21120+
DAG.getMemBasePlusOffset(BasePtr, TypeSize::getFixed(2), DL),
21121+
LD->getPointerInfo(), LD->getOriginalAlign());
21122+
21123+
// Extend to i32.
21124+
SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
21125+
SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
21126+
21127+
// Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
21128+
SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
21129+
DAG.getConstant(16, DL, MVT::i32));
21130+
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shr);
21131+
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
21132+
21133+
// Extract v3i8 again.
21134+
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
21135+
DAG.getConstant(0, DL, MVT::i64));
21136+
SDValue TokenFactor = DAG.getNode(
21137+
ISD::TokenFactor, DL, MVT::Other,
21138+
{SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
21139+
return DAG.getMergeValues({Extract, TokenFactor}, DL);
21140+
}
21141+
2109821142
// Perform TBI simplification if supported by the target and try to break up
2109921143
// nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit
2110021144
// load instructions can be selected.
@@ -21106,10 +21150,16 @@ static SDValue performLOADCombine(SDNode *N,
2110621150
performTBISimplification(N->getOperand(1), DCI, DAG);
2110721151

2110821152
LoadSDNode *LD = cast<LoadSDNode>(N);
21109-
EVT MemVT = LD->getMemoryVT();
21110-
if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
21153+
if (LD->isVolatile() || !Subtarget->isLittleEndian())
21154+
return SDValue(N, 0);
21155+
21156+
if (SDValue Res = combineV3I8LoadExt(LD, DAG))
21157+
return Res;
21158+
21159+
if (!LD->isNonTemporal())
2111121160
return SDValue(N, 0);
2111221161

21162+
EVT MemVT = LD->getMemoryVT();
2111321163
if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
2111421164
MemVT.getSizeInBits() % 256 == 0 ||
2111521165
256 % MemVT.getScalarSizeInBits() != 0)

llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll

+13-31
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,10 @@
55
define <16 x i8> @load_v3i8(ptr %src, ptr %dst) {
66
; CHECK-LABEL: load_v3i8:
77
; CHECK: ; %bb.0:
8-
; CHECK-NEXT: sub sp, sp, #16
9-
; CHECK-NEXT: .cfi_def_cfa_offset 16
10-
; CHECK-NEXT: ldrh w8, [x0]
11-
; CHECK-NEXT: strh w8, [sp, #12]
12-
; CHECK-NEXT: ldr s0, [sp, #12]
13-
; CHECK-NEXT: ushll.8h v0, v0, #0
14-
; CHECK-NEXT: umov.h w8, v0[0]
15-
; CHECK-NEXT: umov.h w9, v0[1]
8+
; CHECK-NEXT: ldrb w8, [x0, #2]
9+
; CHECK-NEXT: ldrh w9, [x0]
10+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
1611
; CHECK-NEXT: fmov s0, w8
17-
; CHECK-NEXT: add x8, x0, #2
18-
; CHECK-NEXT: mov.b v0[1], w9
19-
; CHECK-NEXT: ld1.b { v0 }[2], [x8]
20-
; CHECK-NEXT: add sp, sp, #16
2112
; CHECK-NEXT: ret
2213
;
2314
; BE-LABEL: load_v3i8:
@@ -47,19 +38,14 @@ define <16 x i8> @load_v3i8(ptr %src, ptr %dst) {
4738
define <4 x i32> @load_v3i8_to_4xi32(ptr %src, ptr %dst) {
4839
; CHECK-LABEL: load_v3i8_to_4xi32:
4940
; CHECK: ; %bb.0:
50-
; CHECK-NEXT: sub sp, sp, #16
51-
; CHECK-NEXT: .cfi_def_cfa_offset 16
52-
; CHECK-NEXT: ldrh w8, [x0]
41+
; CHECK-NEXT: ldrb w8, [x0, #2]
42+
; CHECK-NEXT: ldrh w9, [x0]
5343
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
54-
; CHECK-NEXT: strh w8, [sp, #12]
55-
; CHECK-NEXT: ldr s0, [sp, #12]
56-
; CHECK-NEXT: ldrsb w8, [x0, #2]
57-
; CHECK-NEXT: ushll.8h v0, v0, #0
58-
; CHECK-NEXT: mov.h v0[1], v0[1]
59-
; CHECK-NEXT: mov.h v0[2], w8
44+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
45+
; CHECK-NEXT: fmov s0, w8
46+
; CHECK-NEXT: zip1.8b v0, v0, v0
6047
; CHECK-NEXT: ushll.4s v0, v0, #0
6148
; CHECK-NEXT: and.16b v0, v0, v1
62-
; CHECK-NEXT: add sp, sp, #16
6349
; CHECK-NEXT: ret
6450
;
6551
; BE-LABEL: load_v3i8_to_4xi32:
@@ -193,19 +179,15 @@ entry:
193179
define void @load_ext_to_64bits(ptr %src, ptr %dst) {
194180
; CHECK-LABEL: load_ext_to_64bits:
195181
; CHECK: ; %bb.0: ; %entry
196-
; CHECK-NEXT: sub sp, sp, #16
197-
; CHECK-NEXT: .cfi_def_cfa_offset 16
198-
; CHECK-NEXT: ldrh w8, [x0]
199-
; CHECK-NEXT: strh w8, [sp, #12]
200-
; CHECK-NEXT: add x8, x0, #2
201-
; CHECK-NEXT: ldr s0, [sp, #12]
202-
; CHECK-NEXT: ushll.8h v0, v0, #0
203-
; CHECK-NEXT: ld1.b { v0 }[4], [x8]
182+
; CHECK-NEXT: ldrb w8, [x0, #2]
183+
; CHECK-NEXT: ldrh w9, [x0]
184+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
185+
; CHECK-NEXT: fmov s0, w8
204186
; CHECK-NEXT: add x8, x1, #4
187+
; CHECK-NEXT: zip1.8b v0, v0, v0
205188
; CHECK-NEXT: bic.4h v0, #255, lsl #8
206189
; CHECK-NEXT: st1.h { v0 }[2], [x8]
207190
; CHECK-NEXT: str s0, [x1]
208-
; CHECK-NEXT: add sp, sp, #16
209191
; CHECK-NEXT: ret
210192
;
211193
; BE-LABEL: load_ext_to_64bits:

0 commit comments

Comments
 (0)