Skip to content

Commit a6853cd

Browse files
authored
[NVPTX] Auto-Upgrade llvm.nvvm.atomic.load.{inc,dec}.32 (#134111)
These intrinsics can be upgrade to an atomicrmw instruction.
1 parent c13436e commit a6853cd

File tree

10 files changed

+107
-59
lines changed

10 files changed

+107
-59
lines changed

clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp

+4-14
Original file line numberDiff line numberDiff line change
@@ -481,21 +481,11 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
481481
AtomicOrdering::SequentiallyConsistent);
482482
}
483483

484-
case NVPTX::BI__nvvm_atom_inc_gen_ui: {
485-
Value *Ptr = EmitScalarExpr(E->getArg(0));
486-
Value *Val = EmitScalarExpr(E->getArg(1));
487-
Function *FnALI32 =
488-
CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_inc_32, Ptr->getType());
489-
return Builder.CreateCall(FnALI32, {Ptr, Val});
490-
}
484+
case NVPTX::BI__nvvm_atom_inc_gen_ui:
485+
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UIncWrap, E);
491486

492-
case NVPTX::BI__nvvm_atom_dec_gen_ui: {
493-
Value *Ptr = EmitScalarExpr(E->getArg(0));
494-
Value *Val = EmitScalarExpr(E->getArg(1));
495-
Function *FnALD32 =
496-
CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_dec_32, Ptr->getType());
497-
return Builder.CreateCall(FnALD32, {Ptr, Val});
498-
}
487+
case NVPTX::BI__nvvm_atom_dec_gen_ui:
488+
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UDecWrap, E);
499489

500490
case NVPTX::BI__nvvm_ldg_c:
501491
case NVPTX::BI__nvvm_ldg_sc:

clang/test/CodeGen/builtins-nvptx.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -333,10 +333,10 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
333333
// CHECK: atomicrmw fadd ptr {{.*}} seq_cst, align 4
334334
__nvvm_atom_add_gen_f(fp, f);
335335

336-
// CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0
336+
// CHECK: atomicrmw uinc_wrap ptr {{.*}} seq_cst, align 4
337337
__nvvm_atom_inc_gen_ui(uip, ui);
338338

339-
// CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0
339+
// CHECK: atomicrmw udec_wrap ptr {{.*}} seq_cst, align 4
340340
__nvvm_atom_dec_gen_ui(uip, ui);
341341

342342

llvm/include/llvm/IR/IntrinsicsNVVM.td

+2-8
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,8 @@
124124
// * llvm.nvvm.ldg.global.f --> ibid.
125125
// * llvm.nvvm.ldg.global.p --> ibid.
126126
// * llvm.nvvm.swap.lo.hi.b64 --> llvm.fshl(x, x, 32)
127+
// * llvm.nvvm.atomic.load.inc.32 --> atomicrmw uinc_wrap
128+
// * llvm.nvvm.atomic.load.dec.32 --> atomicrmw udec_wrap
127129

128130
def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
129131
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
@@ -1633,14 +1635,6 @@ let TargetPrefix = "nvvm" in {
16331635
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
16341636
[IntrNoMem]>;
16351637

1636-
// Atomics not available as llvm intrinsics.
1637-
def int_nvvm_atomic_load_inc_32 : Intrinsic<[llvm_i32_ty],
1638-
[llvm_anyptr_ty, llvm_i32_ty],
1639-
[IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>]>;
1640-
def int_nvvm_atomic_load_dec_32 : Intrinsic<[llvm_i32_ty],
1641-
[llvm_anyptr_ty, llvm_i32_ty],
1642-
[IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>]>;
1643-
16441638
class SCOPED_ATOMIC2_impl<LLVMType elty>
16451639
: Intrinsic<[elty],
16461640
[llvm_anyptr_ty, LLVMMatchType<0>],

llvm/include/llvm/Target/TargetSelectionDAG.td

+2
Original file line numberDiff line numberDiff line change
@@ -1825,6 +1825,8 @@ defm atomic_load_min : binary_atomic_op<atomic_load_min>;
18251825
defm atomic_load_max : binary_atomic_op<atomic_load_max>;
18261826
defm atomic_load_umin : binary_atomic_op<atomic_load_umin>;
18271827
defm atomic_load_umax : binary_atomic_op<atomic_load_umax>;
1828+
defm atomic_load_uinc_wrap : binary_atomic_op<atomic_load_uinc_wrap>;
1829+
defm atomic_load_udec_wrap : binary_atomic_op<atomic_load_udec_wrap>;
18281830
defm atomic_cmp_swap : ternary_atomic_op<atomic_cmp_swap>;
18291831

18301832
/// Atomic load which zeroes the excess high bits.

llvm/lib/IR/AutoUpgrade.cpp

+9
Original file line numberDiff line numberDiff line change
@@ -1302,6 +1302,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
13021302
else if (Name.consume_front("atomic.load.add."))
13031303
// nvvm.atomic.load.add.{f32.p,f64.p}
13041304
Expand = Name.starts_with("f32.p") || Name.starts_with("f64.p");
1305+
else if (Name.consume_front("atomic.load.") && Name.consume_back(".32"))
1306+
// nvvm.atomic.load.{inc,dec}.32
1307+
Expand = Name == "inc" || Name == "dec";
13051308
else if (Name.consume_front("bitcast."))
13061309
// nvvm.bitcast.{f2i,i2f,ll2d,d2ll}
13071310
Expand =
@@ -2314,6 +2317,12 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
23142317
Value *Val = CI->getArgOperand(1);
23152318
Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(),
23162319
AtomicOrdering::SequentiallyConsistent);
2320+
} else if (Name.consume_front("atomic.load.") && Name.consume_back(".32")) {
2321+
Value *Ptr = CI->getArgOperand(0);
2322+
Value *Val = CI->getArgOperand(1);
2323+
auto Op = Name == "inc" ? AtomicRMWInst::UIncWrap : AtomicRMWInst::UDecWrap;
2324+
Rep = Builder.CreateAtomicRMW(Op, Ptr, Val, MaybeAlign(),
2325+
AtomicOrdering::SequentiallyConsistent);
23172326
} else if (Name.consume_front("max.") &&
23182327
(Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
23192328
Name == "ui" || Name == "ull")) {

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

+12-3
Original file line numberDiff line numberDiff line change
@@ -4067,9 +4067,6 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
40674067
return true;
40684068
}
40694069

4070-
case Intrinsic::nvvm_atomic_load_inc_32:
4071-
case Intrinsic::nvvm_atomic_load_dec_32:
4072-
40734070
case Intrinsic::nvvm_atomic_add_gen_f_cta:
40744071
case Intrinsic::nvvm_atomic_add_gen_f_sys:
40754072
case Intrinsic::nvvm_atomic_add_gen_i_cta:
@@ -6145,6 +6142,18 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
61456142
default:
61466143
llvm_unreachable("unsupported width encountered");
61476144
}
6145+
case AtomicRMWInst::BinOp::UIncWrap:
6146+
case AtomicRMWInst::BinOp::UDecWrap:
6147+
switch (ITy->getBitWidth()) {
6148+
case 32:
6149+
return AtomicExpansionKind::None;
6150+
case 8:
6151+
case 16:
6152+
case 64:
6153+
return AtomicExpansionKind::CmpXChg;
6154+
default:
6155+
llvm_unreachable("unsupported width encountered");
6156+
}
61486157
}
61496158

61506159
return AtomicExpansionKind::CmpXChg;

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

+2-2
Original file line numberDiff line numberDiff line change
@@ -2070,8 +2070,8 @@ defm INT_PTX_ATOMIC_UMIN_32 : F_ATOMIC_2_AS<I32RT, atomic_load_umin_i32, "min.u3
20702070
defm INT_PTX_ATOMIC_UMIN_64 : F_ATOMIC_2_AS<I64RT, atomic_load_umin_i64, "min.u64", [hasSM<32>]>;
20712071

20722072
// atom_inc atom_dec
2073-
defm INT_PTX_ATOM_INC_32 : F_ATOMIC_2_AS<I32RT, int_nvvm_atomic_load_inc_32, "inc.u32">;
2074-
defm INT_PTX_ATOM_DEC_32 : F_ATOMIC_2_AS<I32RT, int_nvvm_atomic_load_dec_32, "dec.u32">;
2073+
defm INT_PTX_ATOM_INC_32 : F_ATOMIC_2_AS<I32RT, atomic_load_uinc_wrap_i32, "inc.u32">;
2074+
defm INT_PTX_ATOM_DEC_32 : F_ATOMIC_2_AS<I32RT, atomic_load_udec_wrap_i32, "dec.u32">;
20752075

20762076
// atom_and
20772077
defm INT_PTX_ATOM_AND_32 : F_ATOMIC_2_AS<I32RT, atomic_load_and_i32, "and.b32">;

llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp

+25-27
Original file line numberDiff line numberDiff line change
@@ -46,33 +46,31 @@ static bool readsLaneId(const IntrinsicInst *II) {
4646
// Whether the given intrinsic is an atomic instruction in PTX.
4747
static bool isNVVMAtomic(const IntrinsicInst *II) {
4848
switch (II->getIntrinsicID()) {
49-
default: return false;
50-
case Intrinsic::nvvm_atomic_load_inc_32:
51-
case Intrinsic::nvvm_atomic_load_dec_32:
52-
53-
case Intrinsic::nvvm_atomic_add_gen_f_cta:
54-
case Intrinsic::nvvm_atomic_add_gen_f_sys:
55-
case Intrinsic::nvvm_atomic_add_gen_i_cta:
56-
case Intrinsic::nvvm_atomic_add_gen_i_sys:
57-
case Intrinsic::nvvm_atomic_and_gen_i_cta:
58-
case Intrinsic::nvvm_atomic_and_gen_i_sys:
59-
case Intrinsic::nvvm_atomic_cas_gen_i_cta:
60-
case Intrinsic::nvvm_atomic_cas_gen_i_sys:
61-
case Intrinsic::nvvm_atomic_dec_gen_i_cta:
62-
case Intrinsic::nvvm_atomic_dec_gen_i_sys:
63-
case Intrinsic::nvvm_atomic_inc_gen_i_cta:
64-
case Intrinsic::nvvm_atomic_inc_gen_i_sys:
65-
case Intrinsic::nvvm_atomic_max_gen_i_cta:
66-
case Intrinsic::nvvm_atomic_max_gen_i_sys:
67-
case Intrinsic::nvvm_atomic_min_gen_i_cta:
68-
case Intrinsic::nvvm_atomic_min_gen_i_sys:
69-
case Intrinsic::nvvm_atomic_or_gen_i_cta:
70-
case Intrinsic::nvvm_atomic_or_gen_i_sys:
71-
case Intrinsic::nvvm_atomic_exch_gen_i_cta:
72-
case Intrinsic::nvvm_atomic_exch_gen_i_sys:
73-
case Intrinsic::nvvm_atomic_xor_gen_i_cta:
74-
case Intrinsic::nvvm_atomic_xor_gen_i_sys:
75-
return true;
49+
default:
50+
return false;
51+
case Intrinsic::nvvm_atomic_add_gen_f_cta:
52+
case Intrinsic::nvvm_atomic_add_gen_f_sys:
53+
case Intrinsic::nvvm_atomic_add_gen_i_cta:
54+
case Intrinsic::nvvm_atomic_add_gen_i_sys:
55+
case Intrinsic::nvvm_atomic_and_gen_i_cta:
56+
case Intrinsic::nvvm_atomic_and_gen_i_sys:
57+
case Intrinsic::nvvm_atomic_cas_gen_i_cta:
58+
case Intrinsic::nvvm_atomic_cas_gen_i_sys:
59+
case Intrinsic::nvvm_atomic_dec_gen_i_cta:
60+
case Intrinsic::nvvm_atomic_dec_gen_i_sys:
61+
case Intrinsic::nvvm_atomic_inc_gen_i_cta:
62+
case Intrinsic::nvvm_atomic_inc_gen_i_sys:
63+
case Intrinsic::nvvm_atomic_max_gen_i_cta:
64+
case Intrinsic::nvvm_atomic_max_gen_i_sys:
65+
case Intrinsic::nvvm_atomic_min_gen_i_cta:
66+
case Intrinsic::nvvm_atomic_min_gen_i_sys:
67+
case Intrinsic::nvvm_atomic_or_gen_i_cta:
68+
case Intrinsic::nvvm_atomic_or_gen_i_sys:
69+
case Intrinsic::nvvm_atomic_exch_gen_i_cta:
70+
case Intrinsic::nvvm_atomic_exch_gen_i_sys:
71+
case Intrinsic::nvvm_atomic_xor_gen_i_cta:
72+
case Intrinsic::nvvm_atomic_xor_gen_i_sys:
73+
return true;
7674
}
7775
}
7876

llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll

+15-1
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ declare i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr, i32)
5252
declare ptr @llvm.nvvm.ldg.global.p.p0(ptr, i32)
5353
declare float @llvm.nvvm.ldg.global.f.f32.p0(ptr, i32)
5454

55+
declare i32 @llvm.nvvm.atomic.load.inc.32(ptr, i32)
56+
declare i32 @llvm.nvvm.atomic.load.dec.32(ptr, i32)
57+
5558
; CHECK-LABEL: @simple_upgrade
5659
define void @simple_upgrade(i32 %a, i64 %b, i16 %c) {
5760
; CHECK: call i32 @llvm.bitreverse.i32(i32 %a)
@@ -224,4 +227,15 @@ define void @ldg(ptr %p0, ptr addrspace(1) %p1) {
224227
%v6 = call float @llvm.nvvm.ldg.global.f.f32.p0(ptr %p0, i32 16)
225228

226229
ret void
227-
}
230+
}
231+
232+
; CHECK-LABEL: @atomics
233+
define i32 @atomics(ptr %p0, i32 %a) {
234+
; CHECK: %1 = atomicrmw uinc_wrap ptr %p0, i32 %a seq_cst
235+
; CHECK: %2 = atomicrmw udec_wrap ptr %p0, i32 %a seq_cst
236+
237+
%r1 = call i32 @llvm.nvvm.atomic.load.inc.32(ptr %p0, i32 %a)
238+
%r2 = call i32 @llvm.nvvm.atomic.load.dec.32(ptr %p0, i32 %a)
239+
ret i32 %r2
240+
}
241+

llvm/test/CodeGen/NVPTX/atomics.ll

+34-2
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,38 @@ define i64 @atom19(ptr %subr, i64 %val) {
313313
ret i64 %ret
314314
}
315315

316+
define i32 @atom20(ptr %subr, i32 %val) {
317+
; CHECK-LABEL: atom20(
318+
; CHECK: {
319+
; CHECK-NEXT: .reg .b32 %r<3>;
320+
; CHECK-NEXT: .reg .b64 %rd<2>;
321+
; CHECK-EMPTY:
322+
; CHECK-NEXT: // %bb.0:
323+
; CHECK-NEXT: ld.param.u64 %rd1, [atom20_param_0];
324+
; CHECK-NEXT: ld.param.u32 %r1, [atom20_param_1];
325+
; CHECK-NEXT: atom.inc.u32 %r2, [%rd1], %r1;
326+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
327+
; CHECK-NEXT: ret;
328+
%ret = atomicrmw uinc_wrap ptr %subr, i32 %val seq_cst
329+
ret i32 %ret
330+
}
331+
332+
define i32 @atom21(ptr %subr, i32 %val) {
333+
; CHECK-LABEL: atom21(
334+
; CHECK: {
335+
; CHECK-NEXT: .reg .b32 %r<3>;
336+
; CHECK-NEXT: .reg .b64 %rd<2>;
337+
; CHECK-EMPTY:
338+
; CHECK-NEXT: // %bb.0:
339+
; CHECK-NEXT: ld.param.u64 %rd1, [atom21_param_0];
340+
; CHECK-NEXT: ld.param.u32 %r1, [atom21_param_1];
341+
; CHECK-NEXT: atom.dec.u32 %r2, [%rd1], %r1;
342+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
343+
; CHECK-NEXT: ret;
344+
%ret = atomicrmw udec_wrap ptr %subr, i32 %val seq_cst
345+
ret i32 %ret
346+
}
347+
316348
declare float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val)
317349

318350
; CHECK-LABEL: atomic_add_f32_generic
@@ -409,7 +441,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
409441
; CHECK-NEXT: not.b32 %r2, %r9;
410442
; CHECK-NEXT: ld.u32 %r16, [%rd1];
411443
; CHECK-NEXT: cvt.f32.f16 %f2, %rs1;
412-
; CHECK-NEXT: $L__BB22_1: // %atomicrmw.start
444+
; CHECK-NEXT: $L__BB24_1: // %atomicrmw.start
413445
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
414446
; CHECK-NEXT: shr.u32 %r10, %r16, %r1;
415447
; CHECK-NEXT: cvt.u16.u32 %rs2, %r10;
@@ -424,7 +456,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
424456
; CHECK-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r14;
425457
; CHECK-NEXT: setp.ne.s32 %p1, %r5, %r16;
426458
; CHECK-NEXT: mov.b32 %r16, %r5;
427-
; CHECK-NEXT: @%p1 bra $L__BB22_1;
459+
; CHECK-NEXT: @%p1 bra $L__BB24_1;
428460
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
429461
; CHECK-NEXT: shr.u32 %r15, %r5, %r1;
430462
; CHECK-NEXT: cvt.u16.u32 %rs4, %r15;

0 commit comments

Comments
 (0)