xref: /aosp_15_r20/external/webrtc/modules/rtp_rtcp/source/video_rtp_depacketizer_h264.cc (revision d9f758449e529ab9291ac668be2861e7a55c2422)
1 /*
2  *  Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_h264.h"
12 
13 #include <cstddef>
14 #include <cstdint>
15 #include <utility>
16 #include <vector>
17 
18 #include "absl/types/optional.h"
19 #include "absl/types/variant.h"
20 #include "common_video/h264/h264_common.h"
21 #include "common_video/h264/pps_parser.h"
22 #include "common_video/h264/sps_parser.h"
23 #include "common_video/h264/sps_vui_rewriter.h"
24 #include "modules/rtp_rtcp/source/byte_io.h"
25 #include "modules/rtp_rtcp/source/rtp_format_h264.h"
26 #include "modules/rtp_rtcp/source/video_rtp_depacketizer.h"
27 #include "rtc_base/checks.h"
28 #include "rtc_base/copy_on_write_buffer.h"
29 #include "rtc_base/logging.h"
30 
31 namespace webrtc {
32 namespace {
33 
// Size in bytes of the NAL unit header that starts every payload.
constexpr size_t kNalHeaderSize = 1;
// Minimum size in bytes of an FU-A payload: byte 0 supplies the F/NRI bits and
// byte 1 supplies the start bit and the original NAL unit type.
constexpr size_t kFuAHeaderSize = 2;
// Size in bytes of the big-endian length field that precedes each NAL unit
// aggregated in a STAP-A payload.
constexpr size_t kLengthFieldSize = sizeof(uint16_t);
// Offset of the first aggregated NAL unit within a STAP-A payload: the STAP-A
// NAL header followed by the first length field.
constexpr size_t kStapAHeaderSize = kNalHeaderSize + kLengthFieldSize;
38 
39 // TODO(pbos): Avoid parsing this here as well as inside the jitter buffer.
ParseStapAStartOffsets(const uint8_t * nalu_ptr,size_t length_remaining,std::vector<size_t> * offsets)40 bool ParseStapAStartOffsets(const uint8_t* nalu_ptr,
41                             size_t length_remaining,
42                             std::vector<size_t>* offsets) {
43   size_t offset = 0;
44   while (length_remaining > 0) {
45     // Buffer doesn't contain room for additional nalu length.
46     if (length_remaining < sizeof(uint16_t))
47       return false;
48     uint16_t nalu_size = ByteReader<uint16_t>::ReadBigEndian(nalu_ptr);
49     nalu_ptr += sizeof(uint16_t);
50     length_remaining -= sizeof(uint16_t);
51     if (nalu_size > length_remaining)
52       return false;
53     nalu_ptr += nalu_size;
54     length_remaining -= nalu_size;
55 
56     offsets->push_back(offset + kStapAHeaderSize);
57     offset += kLengthFieldSize + nalu_size;
58   }
59   return true;
60 }
61 
ProcessStapAOrSingleNalu(rtc::CopyOnWriteBuffer rtp_payload)62 absl::optional<VideoRtpDepacketizer::ParsedRtpPayload> ProcessStapAOrSingleNalu(
63     rtc::CopyOnWriteBuffer rtp_payload) {
64   const uint8_t* const payload_data = rtp_payload.cdata();
65   absl::optional<VideoRtpDepacketizer::ParsedRtpPayload> parsed_payload(
66       absl::in_place);
67   bool modified_buffer = false;
68   parsed_payload->video_payload = rtp_payload;
69   parsed_payload->video_header.width = 0;
70   parsed_payload->video_header.height = 0;
71   parsed_payload->video_header.codec = kVideoCodecH264;
72   parsed_payload->video_header.simulcastIdx = 0;
73   parsed_payload->video_header.is_first_packet_in_frame = true;
74   auto& h264_header = parsed_payload->video_header.video_type_header
75                           .emplace<RTPVideoHeaderH264>();
76 
77   const uint8_t* nalu_start = payload_data + kNalHeaderSize;
78   const size_t nalu_length = rtp_payload.size() - kNalHeaderSize;
79   uint8_t nal_type = payload_data[0] & kH264TypeMask;
80   std::vector<size_t> nalu_start_offsets;
81   if (nal_type == H264::NaluType::kStapA) {
82     // Skip the StapA header (StapA NAL type + length).
83     if (rtp_payload.size() <= kStapAHeaderSize) {
84       RTC_LOG(LS_ERROR) << "StapA header truncated.";
85       return absl::nullopt;
86     }
87 
88     if (!ParseStapAStartOffsets(nalu_start, nalu_length, &nalu_start_offsets)) {
89       RTC_LOG(LS_ERROR) << "StapA packet with incorrect NALU packet lengths.";
90       return absl::nullopt;
91     }
92 
93     h264_header.packetization_type = kH264StapA;
94     nal_type = payload_data[kStapAHeaderSize] & kH264TypeMask;
95   } else {
96     h264_header.packetization_type = kH264SingleNalu;
97     nalu_start_offsets.push_back(0);
98   }
99   h264_header.nalu_type = nal_type;
100   parsed_payload->video_header.frame_type = VideoFrameType::kVideoFrameDelta;
101 
102   nalu_start_offsets.push_back(rtp_payload.size() +
103                                kLengthFieldSize);  // End offset.
104   for (size_t i = 0; i < nalu_start_offsets.size() - 1; ++i) {
105     size_t start_offset = nalu_start_offsets[i];
106     // End offset is actually start offset for next unit, excluding length field
107     // so remove that from this units length.
108     size_t end_offset = nalu_start_offsets[i + 1] - kLengthFieldSize;
109     if (end_offset - start_offset < H264::kNaluTypeSize) {
110       RTC_LOG(LS_ERROR) << "STAP-A packet too short";
111       return absl::nullopt;
112     }
113 
114     NaluInfo nalu;
115     nalu.type = payload_data[start_offset] & kH264TypeMask;
116     nalu.sps_id = -1;
117     nalu.pps_id = -1;
118     start_offset += H264::kNaluTypeSize;
119 
120     switch (nalu.type) {
121       case H264::NaluType::kSps: {
122         // Check if VUI is present in SPS and if it needs to be modified to
123         // avoid
124         // excessive decoder latency.
125 
126         // Copy any previous data first (likely just the first header).
127         rtc::Buffer output_buffer;
128         if (start_offset)
129           output_buffer.AppendData(payload_data, start_offset);
130 
131         absl::optional<SpsParser::SpsState> sps;
132 
133         SpsVuiRewriter::ParseResult result = SpsVuiRewriter::ParseAndRewriteSps(
134             &payload_data[start_offset], end_offset - start_offset, &sps,
135             nullptr, &output_buffer, SpsVuiRewriter::Direction::kIncoming);
136 
137         if (result == SpsVuiRewriter::ParseResult::kVuiRewritten) {
138           if (modified_buffer) {
139             RTC_LOG(LS_WARNING)
140                 << "More than one H264 SPS NAL units needing "
141                    "rewriting found within a single STAP-A packet. "
142                    "Keeping the first and rewriting the last.";
143           }
144 
145           // Rewrite length field to new SPS size.
146           if (h264_header.packetization_type == kH264StapA) {
147             size_t length_field_offset =
148                 start_offset - (H264::kNaluTypeSize + kLengthFieldSize);
149             // Stap-A Length includes payload data and type header.
150             size_t rewritten_size =
151                 output_buffer.size() - start_offset + H264::kNaluTypeSize;
152             ByteWriter<uint16_t>::WriteBigEndian(
153                 &output_buffer[length_field_offset], rewritten_size);
154           }
155 
156           parsed_payload->video_payload.SetData(output_buffer.data(),
157                                                 output_buffer.size());
158           // Append rest of packet.
159           parsed_payload->video_payload.AppendData(
160               &payload_data[end_offset],
161               nalu_length + kNalHeaderSize - end_offset);
162 
163           modified_buffer = true;
164         }
165 
166         if (sps) {
167           parsed_payload->video_header.width = sps->width;
168           parsed_payload->video_header.height = sps->height;
169           nalu.sps_id = sps->id;
170         } else {
171           RTC_LOG(LS_WARNING) << "Failed to parse SPS id from SPS slice.";
172         }
173         parsed_payload->video_header.frame_type =
174             VideoFrameType::kVideoFrameKey;
175         break;
176       }
177       case H264::NaluType::kPps: {
178         uint32_t pps_id;
179         uint32_t sps_id;
180         if (PpsParser::ParsePpsIds(&payload_data[start_offset],
181                                    end_offset - start_offset, &pps_id,
182                                    &sps_id)) {
183           nalu.pps_id = pps_id;
184           nalu.sps_id = sps_id;
185         } else {
186           RTC_LOG(LS_WARNING)
187               << "Failed to parse PPS id and SPS id from PPS slice.";
188         }
189         break;
190       }
191       case H264::NaluType::kIdr:
192         parsed_payload->video_header.frame_type =
193             VideoFrameType::kVideoFrameKey;
194         [[fallthrough]];
195       case H264::NaluType::kSlice: {
196         absl::optional<uint32_t> pps_id = PpsParser::ParsePpsIdFromSlice(
197             &payload_data[start_offset], end_offset - start_offset);
198         if (pps_id) {
199           nalu.pps_id = *pps_id;
200         } else {
201           RTC_LOG(LS_WARNING) << "Failed to parse PPS id from slice of type: "
202                               << static_cast<int>(nalu.type);
203         }
204         break;
205       }
206       // Slices below don't contain SPS or PPS ids.
207       case H264::NaluType::kAud:
208       case H264::NaluType::kEndOfSequence:
209       case H264::NaluType::kEndOfStream:
210       case H264::NaluType::kFiller:
211       case H264::NaluType::kSei:
212         break;
213       case H264::NaluType::kStapA:
214       case H264::NaluType::kFuA:
215         RTC_LOG(LS_WARNING) << "Unexpected STAP-A or FU-A received.";
216         return absl::nullopt;
217     }
218 
219     if (h264_header.nalus_length == kMaxNalusPerPacket) {
220       RTC_LOG(LS_WARNING)
221           << "Received packet containing more than " << kMaxNalusPerPacket
222           << " NAL units. Will not keep track sps and pps ids for all of them.";
223     } else {
224       h264_header.nalus[h264_header.nalus_length++] = nalu;
225     }
226   }
227 
228   return parsed_payload;
229 }
230 
ParseFuaNalu(rtc::CopyOnWriteBuffer rtp_payload)231 absl::optional<VideoRtpDepacketizer::ParsedRtpPayload> ParseFuaNalu(
232     rtc::CopyOnWriteBuffer rtp_payload) {
233   if (rtp_payload.size() < kFuAHeaderSize) {
234     RTC_LOG(LS_ERROR) << "FU-A NAL units truncated.";
235     return absl::nullopt;
236   }
237   absl::optional<VideoRtpDepacketizer::ParsedRtpPayload> parsed_payload(
238       absl::in_place);
239   uint8_t fnri = rtp_payload.cdata()[0] & (kH264FBit | kH264NriMask);
240   uint8_t original_nal_type = rtp_payload.cdata()[1] & kH264TypeMask;
241   bool first_fragment = (rtp_payload.cdata()[1] & kH264SBit) > 0;
242   NaluInfo nalu;
243   nalu.type = original_nal_type;
244   nalu.sps_id = -1;
245   nalu.pps_id = -1;
246   if (first_fragment) {
247     absl::optional<uint32_t> pps_id =
248         PpsParser::ParsePpsIdFromSlice(rtp_payload.cdata() + 2 * kNalHeaderSize,
249                                        rtp_payload.size() - 2 * kNalHeaderSize);
250     if (pps_id) {
251       nalu.pps_id = *pps_id;
252     } else {
253       RTC_LOG(LS_WARNING)
254           << "Failed to parse PPS from first fragment of FU-A NAL "
255              "unit with original type: "
256           << static_cast<int>(nalu.type);
257     }
258     uint8_t original_nal_header = fnri | original_nal_type;
259     rtp_payload =
260         rtp_payload.Slice(kNalHeaderSize, rtp_payload.size() - kNalHeaderSize);
261     rtp_payload.MutableData()[0] = original_nal_header;
262     parsed_payload->video_payload = std::move(rtp_payload);
263   } else {
264     parsed_payload->video_payload =
265         rtp_payload.Slice(kFuAHeaderSize, rtp_payload.size() - kFuAHeaderSize);
266   }
267 
268   if (original_nal_type == H264::NaluType::kIdr) {
269     parsed_payload->video_header.frame_type = VideoFrameType::kVideoFrameKey;
270   } else {
271     parsed_payload->video_header.frame_type = VideoFrameType::kVideoFrameDelta;
272   }
273   parsed_payload->video_header.width = 0;
274   parsed_payload->video_header.height = 0;
275   parsed_payload->video_header.codec = kVideoCodecH264;
276   parsed_payload->video_header.simulcastIdx = 0;
277   parsed_payload->video_header.is_first_packet_in_frame = first_fragment;
278   auto& h264_header = parsed_payload->video_header.video_type_header
279                           .emplace<RTPVideoHeaderH264>();
280   h264_header.packetization_type = kH264FuA;
281   h264_header.nalu_type = original_nal_type;
282   if (first_fragment) {
283     h264_header.nalus[h264_header.nalus_length] = nalu;
284     h264_header.nalus_length = 1;
285   }
286   return parsed_payload;
287 }
288 
289 }  // namespace
290 
291 absl::optional<VideoRtpDepacketizer::ParsedRtpPayload>
Parse(rtc::CopyOnWriteBuffer rtp_payload)292 VideoRtpDepacketizerH264::Parse(rtc::CopyOnWriteBuffer rtp_payload) {
293   if (rtp_payload.size() == 0) {
294     RTC_LOG(LS_ERROR) << "Empty payload.";
295     return absl::nullopt;
296   }
297 
298   uint8_t nal_type = rtp_payload.cdata()[0] & kH264TypeMask;
299 
300   if (nal_type == H264::NaluType::kFuA) {
301     // Fragmented NAL units (FU-A).
302     return ParseFuaNalu(std::move(rtp_payload));
303   } else {
304     // We handle STAP-A and single NALU's the same way here. The jitter buffer
305     // will depacketize the STAP-A into NAL units later.
306     return ProcessStapAOrSingleNalu(std::move(rtp_payload));
307   }
308 }
309 
310 }  // namespace webrtc
311