Make sure that the proper video stream index is used by the GPU decoder

JanuszL · JanuszL · commit eada8b28e9ac · 2024-10-17T19:14:44.000+02:00
- fixes the GPU decoder's hard-coded use of stream index 0 by using the
  properly obtained video stream index

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
diff --git a/dali/operators/reader/loader/video/frames_decoder.h b/dali/operators/reader/loader/video/frames_decoder.h
@@ -219,6 +219,9 @@ class DLL_PUBLIC FramesDecoder {
 
   std::optional<bool> zero_latency_ = {};
 
+  // False when the file doesn't have any correct content or doesn't have a valid video stream
+  bool is_valid_ = false;
+
  private:
    /**
    * @brief Gets the packet from the decoder and reads a frame from it to provided buffer. Returns
@@ -275,8 +278,6 @@ class DLL_PUBLIC FramesDecoder {
   int channels_ = 3;
   bool flush_state_ = false;
   bool is_vfr_ = false;
-  // False when the file doesn't have any correct content or doesn't have valid video stream
-  bool is_valid_ = false;
 
   const std::string filename_ = {};
   std::optional<MemoryVideoFile> memory_video_file_ = {};
diff --git a/dali/operators/reader/loader/video/frames_decoder_gpu.cc b/dali/operators/reader/loader/video/frames_decoder_gpu.cc
@@ -331,7 +331,8 @@ void FramesDecoderGpu::InitBitStreamFilter() {
   }
 
   DALI_ENFORCE(
-    avcodec_parameters_copy(bsfc_->par_in, av_state_->ctx_->streams[0]->codecpar) >= 0,
+    avcodec_parameters_copy(bsfc_->par_in,
+                            av_state_->ctx_->streams[av_state_->stream_id_]->codecpar) >= 0,
     "Unable to copy bit stream filter parameters");
   DALI_ENFORCE(
     av_bsf_init(bsfc_) >= 0,
@@ -364,7 +365,11 @@ void FramesDecoderGpu::InitGpuParser() {
   InitBitStreamFilter();
 
   filtered_packet_ = av_packet_alloc();
-  DALI_ENFORCE(filtered_packet_, "Could not allocate av packet");
+  if (!filtered_packet_) {
+    DALI_WARN(make_string("Could not allocate av packet for \"", Filename(), "\""));
+    is_valid_ = false;
+    return;
+  }
 
   auto codec_type = GetCodecType();
 
@@ -380,8 +385,8 @@ void FramesDecoderGpu::InitGpuParser() {
   parser_info.pfnDecodePicture = frame_dec_gpu_impl::process_picture_decode;
   parser_info.pfnDisplayPicture = nullptr;
 
-  auto extradata = av_state_->ctx_->streams[0]->codecpar->extradata;
-  auto extradata_size = av_state_->ctx_->streams[0]->codecpar->extradata_size;
+  auto extradata = av_state_->ctx_->streams[av_state_->stream_id_]->codecpar->extradata;
+  auto extradata_size = av_state_->ctx_->streams[av_state_->stream_id_]->codecpar->extradata_size;
 
   memset(&parser_extinfo, 0, sizeof(parser_extinfo));
   parser_info.pExtVideoInfo = &parser_extinfo;
diff --git a/dali/test/python/input/test_video.py b/dali/test/python/input/test_video.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import glob
+import os
 import itertools
 import numpy as np
 import nvidia.dali.fn as fn
@@ -22,7 +23,8 @@
 from nvidia.dali import pipeline_def
 from test_utils import get_dali_extra_path, to_array
 
-filenames = glob.glob(f"{get_dali_extra_path()}/db/video/[cv]fr/*.mp4")
+test_data_root = get_dali_extra_path()
+filenames = glob.glob(f"{test_data_root}/db/video/[cv]fr/*.mp4")
 # filter out HEVC because some GPUs do not support it
 filenames = filter(lambda filename: "hevc" not in filename, filenames)
 # mpeg4 is not yet supported in the CPU operator
@@ -254,3 +256,26 @@ def test_video_input_input_queue(device, n_test_files):
         glob="No data was provided to the InputOperator. Make sure to feed it properly.",
     ):
         input_pipe.run()
+
+
+@params(*device_values)
+def test_video_input_audio_stream(device):
+    """
+    Checks that video decoding works when the input file also contains an audio stream
+    """
+    input_name = "VIDEO_INPUT"
+
+    input_pipe = video_input_pipeline(
+        input_name=input_name,
+        batch_size=3,
+        sequence_length=4,
+        device=device,
+        **common_pipeline_params,
+    )
+
+    filename = os.path.join(test_data_root, "db", "video", "sintel", "sintel_trailer-720p.mp4")
+    test_file = np.fromfile(filename, dtype=np.uint8)  # raw file bytes, not decoded frames
+    input_pipe.build()
+    input_pipe.feed_input(input_name, np.array([[test_file]]))
+
+    input_pipe.run()  # no output checks: the test passes if run() does not raise