Now only taskloops whose loop iter is used in multideps have num_deps = -1

mikaoP · mikaoP · commit 0e18e971fc93 · 2020-11-13T08:13:58.000Z
Closes llvm#101
diff --git a/llvm/lib/Transforms/OmpSs/OmpSsTransform.cpp b/llvm/lib/Transforms/OmpSs/OmpSsTransform.cpp
@@ -1726,6 +1726,19 @@ struct OmpSs : public ModulePass {
     return OlConstraintsFuncVar;
   }
 
+  // Reports whether any MultiDependInfo entry in DependsInfo.List has
+  // LoopInfo.IndVar among its Args (compared by pointer identity).
+  bool hasMultidepUsingLoopIter(const DirectiveLoopInfo &LoopInfo,
+                                const DirectiveDependsInfo &DependsInfo) {
+    for (const auto &Dep : DependsInfo.List) {
+      if (const auto *Multi = dyn_cast<MultiDependInfo>(Dep.get()))
+        for (const auto *Arg : Multi->Args)
+          if (Arg == LoopInfo.IndVar)
+            return true;
+    }
+    return false;
+  }
+
   Function *createPriorityOlFunc(
       Module &M, Function &F, int taskNum,
       const MapVector<Value *, size_t> &TaskArgsToStructIdxMap,
@@ -2153,6 +2166,7 @@ struct OmpSs : public ModulePass {
       const DirectiveVLADimsInfo &VLADimsInfo = DirEnv.VLADimsInfo;
       const DirectiveCapturedInfo &CapturedInfo = DirEnv.CapturedInfo;
       const DirectiveDependsInfo &DependsInfo = DirEnv.DependsInfo;
+      const DirectiveLoopInfo &LoopInfo = DirEnv.LoopInfo;
 
       IRBuilder<> IRB(codeReplacer);
       // Set debug info from the task entry to all instructions
@@ -2210,8 +2224,9 @@ struct OmpSs : public ModulePass {
       Instruction *NumDependencies = IRB.CreateAlloca(IRB.getInt64Ty(), nullptr, "num.deps");
       PostMoveInstructions.push_back(NumDependencies);
 
-      if (DirEnv.isOmpSsTaskLoopDirective()) {
-        // If taskloop NumDeps = -1
+      if (DirEnv.isOmpSsTaskLoopDirective() && hasMultidepUsingLoopIter(LoopInfo, DependsInfo)) {
+        // If taskloop has a multidep using the loop iterator
+        // NumDeps = -1
         IRB.CreateStore(IRB.getInt64(-1), NumDependencies);
       } else {
         IRB.CreateStore(IRB.getInt64(0), NumDependencies);
diff --git a/llvm/test/Transforms/OmpSs/loop_directives_num_deps.ll b/llvm/test/Transforms/OmpSs/loop_directives_num_deps.ll
@@ -5,8 +5,6 @@ source_filename = "loop_directives_num_deps.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-; taskloop directives have -1 num dependencies
-
 ; void foo(int lb, int ub, int step) {
 ;     #pragma oss task for
 ;     for (int i = 0; i < 10; i += 1) {}
@@ -63,7 +61,7 @@ define void @foo(i32 %lb, i32 %ub, i32 %step) #0 !dbg !6 {
 ; CHECK-NEXT:    br label [[FINAL_COND12:%.*]], [[DBG10]]
 ; CHECK:       codeRepl17:
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast %nanos6_task_args_foo1** [[TMP2]] to i8**, [[DBG10]]
-; CHECK-NEXT:    store i64 -1, i64* [[NUM_DEPS20]], align 8, [[DBG10]]
+; CHECK-NEXT:    store i64 0, i64* [[NUM_DEPS20]], align 8, [[DBG10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i64, i64* [[NUM_DEPS20]], align 8, [[DBG10]]
 ; CHECK-NEXT:    call void @nanos6_create_task(%nanos6_task_info_t* @task_info_var_foo1, %nanos6_task_invocation_info_t* @task_invocation_info_foo1, i64 16, i8** [[TMP11]], i8** [[TMP3]], i64 4, i64 [[TMP12]]), [[DBG10]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load %nanos6_task_args_foo1*, %nanos6_task_args_foo1** [[TMP2]], align 8, [[DBG10]]
@@ -84,7 +82,7 @@ define void @foo(i32 %lb, i32 %ub, i32 %step) #0 !dbg !6 {
 ; CHECK-NEXT:    br label [[FINAL_COND29:%.*]], [[DBG11]]
 ; CHECK:       codeRepl34:
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast %nanos6_task_args_foo2** [[TMP4]] to i8**, [[DBG11]]
-; CHECK-NEXT:    store i64 -1, i64* [[NUM_DEPS37]], align 8, [[DBG11]]
+; CHECK-NEXT:    store i64 0, i64* [[NUM_DEPS37]], align 8, [[DBG11]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = load i64, i64* [[NUM_DEPS37]], align 8, [[DBG11]]
 ; CHECK-NEXT:    call void @nanos6_create_task(%nanos6_task_info_t* @task_info_var_foo2, %nanos6_task_invocation_info_t* @task_invocation_info_foo2, i64 16, i8** [[TMP16]], i8** [[TMP5]], i64 12, i64 [[TMP17]]), [[DBG11]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = load %nanos6_task_args_foo2*, %nanos6_task_args_foo2** [[TMP4]], align 8, [[DBG11]]
diff --git a/llvm/test/Transforms/OmpSs/taskloop_final_loop.ll b/llvm/test/Transforms/OmpSs/taskloop_final_loop.ll
@@ -44,7 +44,7 @@ define dso_local void @taskloop(i32 %lb, i32 %ub, i32 %step) #0 !dbg !6 {
 ; CHECK-NEXT:    br label [[FINAL_COND:%.*]], [[DBG8]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %nanos6_task_args_taskloop0** [[TMP0]] to i8**, [[DBG8]]
-; CHECK-NEXT:    store i64 -1, i64* [[NUM_DEPS]], align 8, [[DBG8]]
+; CHECK-NEXT:    store i64 0, i64* [[NUM_DEPS]], align 8, [[DBG8]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[NUM_DEPS]], align 8, [[DBG8]]
 ; CHECK-NEXT:    call void @nanos6_create_task(%nanos6_task_info_t* @task_info_var_taskloop0, %nanos6_task_invocation_info_t* @task_invocation_info_taskloop0, i64 16, i8** [[TMP2]], i8** [[TMP1]], i64 4, i64 [[TMP3]]), [[DBG8]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = load %nanos6_task_args_taskloop0*, %nanos6_task_args_taskloop0** [[TMP0]], align 8, [[DBG8]]
diff --git a/llvm/test/Transforms/OmpSs/taskloop_multideps.ll b/llvm/test/Transforms/OmpSs/taskloop_multideps.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt %s -ompss-2 -S | FileCheck %s
 ; ModuleID = 'taskloop_multideps.ll'
 source_filename = "taskloop_multideps.ll"
@@ -7,6 +8,9 @@ target triple = "x86_64-unknown-linux-gnu"
 ; This test checks we use nanos6 lower bound to build
 ; multidep loop and call to register dep
 
+; Also checks that a taskloop whose multideps use the loop
+; iterator is created with num_deps = -1
+
 ; int v[10];
 ; int main() {
 ;     #pragma oss taskloop out( { v[i], i=0;j } )
@@ -20,6 +24,57 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: noinline nounwind optnone
 define i32 @main() #0 !dbg !6 {
+; CHECK-LABEL: @main(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[J:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 0, i32* [[J]], align 4, [[DBG9:!dbg !.*]]
+; CHECK-NEXT:    store i32 0, i32* [[I]], align 4, [[DBG10:!dbg !.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca %nanos6_task_args_main0*, align 8, [[DBG9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca i8*, align 8, [[DBG9]]
+; CHECK-NEXT:    [[NUM_DEPS:%.*]] = alloca i64, align 8, [[DBG9]]
+; CHECK-NEXT:    br label [[FINAL_COND:%.*]], [[DBG9]]
+; CHECK:       codeRepl:
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %nanos6_task_args_main0** [[TMP0]] to i8**, [[DBG9]]
+; CHECK-NEXT:    store i64 -1, i64* [[NUM_DEPS]], align 8, [[DBG9]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[NUM_DEPS]], align 8, [[DBG9]]
+; CHECK-NEXT:    call void @nanos6_create_task(%nanos6_task_info_t* @task_info_var_main0, %nanos6_task_invocation_info_t* @task_invocation_info_main0, i64 32, i8** [[TMP2]], i8** [[TMP1]], i64 4, i64 [[TMP3]]), [[DBG9]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load %nanos6_task_args_main0*, %nanos6_task_args_main0** [[TMP0]], align 8, [[DBG9]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %nanos6_task_args_main0* [[TMP4]] to i8*, [[DBG9]]
+; CHECK-NEXT:    [[ARGS_END:%.*]] = getelementptr i8, i8* [[TMP5]], i64 32, [[DBG9]]
+; CHECK-NEXT:    [[GEP_V:%.*]] = getelementptr [[NANOS6_TASK_ARGS_MAIN0:%.*]], %nanos6_task_args_main0* [[TMP4]], i32 0, i32 0, [[DBG9]]
+; CHECK-NEXT:    store [10 x i32]* @v, [10 x i32]** [[GEP_V]], align 8, [[DBG9]]
+; CHECK-NEXT:    [[CAPT_GEP_:%.*]] = getelementptr [[NANOS6_TASK_ARGS_MAIN0]], %nanos6_task_args_main0* [[TMP4]], i32 0, i32 3, [[DBG9]]
+; CHECK-NEXT:    store i32 0, i32* [[CAPT_GEP_]], align 4, [[DBG9]]
+; CHECK-NEXT:    [[CAPT_GEP_4:%.*]] = getelementptr [[NANOS6_TASK_ARGS_MAIN0]], %nanos6_task_args_main0* [[TMP4]], i32 0, i32 4, [[DBG9]]
+; CHECK-NEXT:    store i32 10, i32* [[CAPT_GEP_4]], align 4, [[DBG9]]
+; CHECK-NEXT:    [[CAPT_GEP_5:%.*]] = getelementptr [[NANOS6_TASK_ARGS_MAIN0]], %nanos6_task_args_main0* [[TMP4]], i32 0, i32 5, [[DBG9]]
+; CHECK-NEXT:    store i32 1, i32* [[CAPT_GEP_5]], align 4, [[DBG9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[TMP1]], align 8, [[DBG9]]
+; CHECK-NEXT:    call void @nanos6_register_loop_bounds(i8* [[TMP6]], i64 0, i64 10, i64 0, i64 0), [[DBG9]]
+; CHECK-NEXT:    call void @nanos6_submit_task(i8* [[TMP6]]), [[DBG9]]
+; CHECK-NEXT:    br label [[FINAL_END:%.*]], [[DBG9]]
+; CHECK:       final.end:
+; CHECK-NEXT:    ret i32 0, [[DBG11:!dbg !.*]]
+; CHECK:       final.then:
+; CHECK-NEXT:    store i32 0, i32* [[J]], align 4, [[DBG9]]
+; CHECK-NEXT:    br label [[FOR_COND:%.*]], [[DBG9]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[J]], align 4, [[DBG9]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt i32 [[TMP7]], 10, [[DBG9]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[FOR_BODY:%.*]], label [[FINAL_END]], [[DBG9]]
+; CHECK:       for.body:
+; CHECK-NEXT:    br label [[FOR_INCR:%.*]], [[DBG11]]
+; CHECK:       for.incr:
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[J]], align 4, [[DBG9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], 1, [[DBG9]]
+; CHECK-NEXT:    store i32 [[TMP10]], i32* [[J]], align 4, [[DBG9]]
+; CHECK-NEXT:    br label [[FOR_COND]], [[DBG9]]
+; CHECK:       final.cond:
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @nanos6_in_final(), [[DBG9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0, [[DBG9]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[FINAL_THEN:%.*]], label [[CODEREPL:%.*]], [[DBG9]]
+;
 entry:
   %j = alloca i32, align 4
   %i = alloca i32, align 4
@@ -37,6 +92,24 @@ declare token @llvm.directive.region.entry() #1
 declare void @llvm.directive.region.exit(token) #1
 
 define internal %struct._depend_unpack_t @compute_dep(i32* %i, i32* %j) {
+; CHECK-LABEL: @compute_dep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETURN_VAL:%.*]] = alloca [[STRUCT__DEPEND_UNPACK_T:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4, [[DBG10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[J:%.*]], align 4, [[DBG10]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 0, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], -1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT__DEPEND_UNPACK_T]], %struct._depend_unpack_t* [[RETURN_VAL]], i32 0, i32 0
+; CHECK-NEXT:    store i32 0, i32* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT__DEPEND_UNPACK_T]], %struct._depend_unpack_t* [[RETURN_VAL]], i32 0, i32 1
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT__DEPEND_UNPACK_T]], %struct._depend_unpack_t* [[RETURN_VAL]], i32 0, i32 2
+; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT__DEPEND_UNPACK_T]], %struct._depend_unpack_t* [[RETURN_VAL]], i32 0, i32 3
+; CHECK-NEXT:    store i32 1, i32* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load [[STRUCT__DEPEND_UNPACK_T]], %struct._depend_unpack_t* [[RETURN_VAL]], align 4
+; CHECK-NEXT:    ret [[STRUCT__DEPEND_UNPACK_T]] [[TMP8]]
+;
 entry:
   %return.val = alloca %struct._depend_unpack_t, align 4
   %0 = load i32, i32* %i, align 4, !dbg !10
@@ -56,6 +129,26 @@ entry:
 }
 
 define internal %struct._depend_unpack_t.0 @compute_dep.1(i32* %i, i32* %j, [10 x i32]* %v) {
+; CHECK-LABEL: @compute_dep.1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETURN_VAL:%.*]] = alloca [[STRUCT__DEPEND_UNPACK_T_0:%.*]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[I:%.*]], align 4, [[DBG10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 1
+; CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[V:%.*]], i64 0, i64 0, [[DBG10]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT__DEPEND_UNPACK_T_0]], %struct._depend_unpack_t.0* [[RETURN_VAL]], i32 0, i32 0
+; CHECK-NEXT:    store i32* [[ARRAYDECAY]], i32** [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT__DEPEND_UNPACK_T_0]], %struct._depend_unpack_t.0* [[RETURN_VAL]], i32 0, i32 1
+; CHECK-NEXT:    store i64 40, i64* [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT__DEPEND_UNPACK_T_0]], %struct._depend_unpack_t.0* [[RETURN_VAL]], i32 0, i32 2
+; CHECK-NEXT:    store i64 [[TMP3]], i64* [[TMP7]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__DEPEND_UNPACK_T_0]], %struct._depend_unpack_t.0* [[RETURN_VAL]], i32 0, i32 3
+; CHECK-NEXT:    store i64 [[TMP4]], i64* [[TMP8]], align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load [[STRUCT__DEPEND_UNPACK_T_0]], %struct._depend_unpack_t.0* [[RETURN_VAL]], align 8
+; CHECK-NEXT:    ret [[STRUCT__DEPEND_UNPACK_T_0]] [[TMP9]]
+;
 entry:
   %return.val = alloca %struct._depend_unpack_t.0, align 8
   %0 = load i32, i32* %i, align 4, !dbg !10