1 /*
2 * Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_h264.h"
12
13 #include <cstddef>
14 #include <cstdint>
15 #include <utility>
16 #include <vector>
17
18 #include "absl/types/optional.h"
19 #include "absl/types/variant.h"
20 #include "common_video/h264/h264_common.h"
21 #include "common_video/h264/pps_parser.h"
22 #include "common_video/h264/sps_parser.h"
23 #include "common_video/h264/sps_vui_rewriter.h"
24 #include "modules/rtp_rtcp/source/byte_io.h"
25 #include "modules/rtp_rtcp/source/rtp_format_h264.h"
26 #include "modules/rtp_rtcp/source/video_rtp_depacketizer.h"
27 #include "rtc_base/checks.h"
28 #include "rtc_base/copy_on_write_buffer.h"
29 #include "rtc_base/logging.h"
30
31 namespace webrtc {
32 namespace {
33
// Size in bytes of the NAL unit header that starts every payload handled in
// this file.
constexpr size_t kNalHeaderSize = 1;
// Size in bytes of the big-endian length prefix that precedes each NAL unit
// aggregated inside a STAP-A packet.
constexpr size_t kLengthFieldSize = 2;
// Number of leading bytes of a FU-A payload that are inspected as header
// before the fragment data.
constexpr size_t kFuAHeaderSize = 2;
// Offset from the start of a STAP-A payload to the header byte of the first
// aggregated NAL unit (STAP-A NAL header followed by one length field).
constexpr size_t kStapAHeaderSize = kNalHeaderSize + kLengthFieldSize;
38
39 // TODO(pbos): Avoid parsing this here as well as inside the jitter buffer.
ParseStapAStartOffsets(const uint8_t * nalu_ptr,size_t length_remaining,std::vector<size_t> * offsets)40 bool ParseStapAStartOffsets(const uint8_t* nalu_ptr,
41 size_t length_remaining,
42 std::vector<size_t>* offsets) {
43 size_t offset = 0;
44 while (length_remaining > 0) {
45 // Buffer doesn't contain room for additional nalu length.
46 if (length_remaining < sizeof(uint16_t))
47 return false;
48 uint16_t nalu_size = ByteReader<uint16_t>::ReadBigEndian(nalu_ptr);
49 nalu_ptr += sizeof(uint16_t);
50 length_remaining -= sizeof(uint16_t);
51 if (nalu_size > length_remaining)
52 return false;
53 nalu_ptr += nalu_size;
54 length_remaining -= nalu_size;
55
56 offsets->push_back(offset + kStapAHeaderSize);
57 offset += kLengthFieldSize + nalu_size;
58 }
59 return true;
60 }
61
ProcessStapAOrSingleNalu(rtc::CopyOnWriteBuffer rtp_payload)62 absl::optional<VideoRtpDepacketizer::ParsedRtpPayload> ProcessStapAOrSingleNalu(
63 rtc::CopyOnWriteBuffer rtp_payload) {
64 const uint8_t* const payload_data = rtp_payload.cdata();
65 absl::optional<VideoRtpDepacketizer::ParsedRtpPayload> parsed_payload(
66 absl::in_place);
67 bool modified_buffer = false;
68 parsed_payload->video_payload = rtp_payload;
69 parsed_payload->video_header.width = 0;
70 parsed_payload->video_header.height = 0;
71 parsed_payload->video_header.codec = kVideoCodecH264;
72 parsed_payload->video_header.simulcastIdx = 0;
73 parsed_payload->video_header.is_first_packet_in_frame = true;
74 auto& h264_header = parsed_payload->video_header.video_type_header
75 .emplace<RTPVideoHeaderH264>();
76
77 const uint8_t* nalu_start = payload_data + kNalHeaderSize;
78 const size_t nalu_length = rtp_payload.size() - kNalHeaderSize;
79 uint8_t nal_type = payload_data[0] & kH264TypeMask;
80 std::vector<size_t> nalu_start_offsets;
81 if (nal_type == H264::NaluType::kStapA) {
82 // Skip the StapA header (StapA NAL type + length).
83 if (rtp_payload.size() <= kStapAHeaderSize) {
84 RTC_LOG(LS_ERROR) << "StapA header truncated.";
85 return absl::nullopt;
86 }
87
88 if (!ParseStapAStartOffsets(nalu_start, nalu_length, &nalu_start_offsets)) {
89 RTC_LOG(LS_ERROR) << "StapA packet with incorrect NALU packet lengths.";
90 return absl::nullopt;
91 }
92
93 h264_header.packetization_type = kH264StapA;
94 nal_type = payload_data[kStapAHeaderSize] & kH264TypeMask;
95 } else {
96 h264_header.packetization_type = kH264SingleNalu;
97 nalu_start_offsets.push_back(0);
98 }
99 h264_header.nalu_type = nal_type;
100 parsed_payload->video_header.frame_type = VideoFrameType::kVideoFrameDelta;
101
102 nalu_start_offsets.push_back(rtp_payload.size() +
103 kLengthFieldSize); // End offset.
104 for (size_t i = 0; i < nalu_start_offsets.size() - 1; ++i) {
105 size_t start_offset = nalu_start_offsets[i];
106 // End offset is actually start offset for next unit, excluding length field
107 // so remove that from this units length.
108 size_t end_offset = nalu_start_offsets[i + 1] - kLengthFieldSize;
109 if (end_offset - start_offset < H264::kNaluTypeSize) {
110 RTC_LOG(LS_ERROR) << "STAP-A packet too short";
111 return absl::nullopt;
112 }
113
114 NaluInfo nalu;
115 nalu.type = payload_data[start_offset] & kH264TypeMask;
116 nalu.sps_id = -1;
117 nalu.pps_id = -1;
118 start_offset += H264::kNaluTypeSize;
119
120 switch (nalu.type) {
121 case H264::NaluType::kSps: {
122 // Check if VUI is present in SPS and if it needs to be modified to
123 // avoid
124 // excessive decoder latency.
125
126 // Copy any previous data first (likely just the first header).
127 rtc::Buffer output_buffer;
128 if (start_offset)
129 output_buffer.AppendData(payload_data, start_offset);
130
131 absl::optional<SpsParser::SpsState> sps;
132
133 SpsVuiRewriter::ParseResult result = SpsVuiRewriter::ParseAndRewriteSps(
134 &payload_data[start_offset], end_offset - start_offset, &sps,
135 nullptr, &output_buffer, SpsVuiRewriter::Direction::kIncoming);
136
137 if (result == SpsVuiRewriter::ParseResult::kVuiRewritten) {
138 if (modified_buffer) {
139 RTC_LOG(LS_WARNING)
140 << "More than one H264 SPS NAL units needing "
141 "rewriting found within a single STAP-A packet. "
142 "Keeping the first and rewriting the last.";
143 }
144
145 // Rewrite length field to new SPS size.
146 if (h264_header.packetization_type == kH264StapA) {
147 size_t length_field_offset =
148 start_offset - (H264::kNaluTypeSize + kLengthFieldSize);
149 // Stap-A Length includes payload data and type header.
150 size_t rewritten_size =
151 output_buffer.size() - start_offset + H264::kNaluTypeSize;
152 ByteWriter<uint16_t>::WriteBigEndian(
153 &output_buffer[length_field_offset], rewritten_size);
154 }
155
156 parsed_payload->video_payload.SetData(output_buffer.data(),
157 output_buffer.size());
158 // Append rest of packet.
159 parsed_payload->video_payload.AppendData(
160 &payload_data[end_offset],
161 nalu_length + kNalHeaderSize - end_offset);
162
163 modified_buffer = true;
164 }
165
166 if (sps) {
167 parsed_payload->video_header.width = sps->width;
168 parsed_payload->video_header.height = sps->height;
169 nalu.sps_id = sps->id;
170 } else {
171 RTC_LOG(LS_WARNING) << "Failed to parse SPS id from SPS slice.";
172 }
173 parsed_payload->video_header.frame_type =
174 VideoFrameType::kVideoFrameKey;
175 break;
176 }
177 case H264::NaluType::kPps: {
178 uint32_t pps_id;
179 uint32_t sps_id;
180 if (PpsParser::ParsePpsIds(&payload_data[start_offset],
181 end_offset - start_offset, &pps_id,
182 &sps_id)) {
183 nalu.pps_id = pps_id;
184 nalu.sps_id = sps_id;
185 } else {
186 RTC_LOG(LS_WARNING)
187 << "Failed to parse PPS id and SPS id from PPS slice.";
188 }
189 break;
190 }
191 case H264::NaluType::kIdr:
192 parsed_payload->video_header.frame_type =
193 VideoFrameType::kVideoFrameKey;
194 [[fallthrough]];
195 case H264::NaluType::kSlice: {
196 absl::optional<uint32_t> pps_id = PpsParser::ParsePpsIdFromSlice(
197 &payload_data[start_offset], end_offset - start_offset);
198 if (pps_id) {
199 nalu.pps_id = *pps_id;
200 } else {
201 RTC_LOG(LS_WARNING) << "Failed to parse PPS id from slice of type: "
202 << static_cast<int>(nalu.type);
203 }
204 break;
205 }
206 // Slices below don't contain SPS or PPS ids.
207 case H264::NaluType::kAud:
208 case H264::NaluType::kEndOfSequence:
209 case H264::NaluType::kEndOfStream:
210 case H264::NaluType::kFiller:
211 case H264::NaluType::kSei:
212 break;
213 case H264::NaluType::kStapA:
214 case H264::NaluType::kFuA:
215 RTC_LOG(LS_WARNING) << "Unexpected STAP-A or FU-A received.";
216 return absl::nullopt;
217 }
218
219 if (h264_header.nalus_length == kMaxNalusPerPacket) {
220 RTC_LOG(LS_WARNING)
221 << "Received packet containing more than " << kMaxNalusPerPacket
222 << " NAL units. Will not keep track sps and pps ids for all of them.";
223 } else {
224 h264_header.nalus[h264_header.nalus_length++] = nalu;
225 }
226 }
227
228 return parsed_payload;
229 }
230
ParseFuaNalu(rtc::CopyOnWriteBuffer rtp_payload)231 absl::optional<VideoRtpDepacketizer::ParsedRtpPayload> ParseFuaNalu(
232 rtc::CopyOnWriteBuffer rtp_payload) {
233 if (rtp_payload.size() < kFuAHeaderSize) {
234 RTC_LOG(LS_ERROR) << "FU-A NAL units truncated.";
235 return absl::nullopt;
236 }
237 absl::optional<VideoRtpDepacketizer::ParsedRtpPayload> parsed_payload(
238 absl::in_place);
239 uint8_t fnri = rtp_payload.cdata()[0] & (kH264FBit | kH264NriMask);
240 uint8_t original_nal_type = rtp_payload.cdata()[1] & kH264TypeMask;
241 bool first_fragment = (rtp_payload.cdata()[1] & kH264SBit) > 0;
242 NaluInfo nalu;
243 nalu.type = original_nal_type;
244 nalu.sps_id = -1;
245 nalu.pps_id = -1;
246 if (first_fragment) {
247 absl::optional<uint32_t> pps_id =
248 PpsParser::ParsePpsIdFromSlice(rtp_payload.cdata() + 2 * kNalHeaderSize,
249 rtp_payload.size() - 2 * kNalHeaderSize);
250 if (pps_id) {
251 nalu.pps_id = *pps_id;
252 } else {
253 RTC_LOG(LS_WARNING)
254 << "Failed to parse PPS from first fragment of FU-A NAL "
255 "unit with original type: "
256 << static_cast<int>(nalu.type);
257 }
258 uint8_t original_nal_header = fnri | original_nal_type;
259 rtp_payload =
260 rtp_payload.Slice(kNalHeaderSize, rtp_payload.size() - kNalHeaderSize);
261 rtp_payload.MutableData()[0] = original_nal_header;
262 parsed_payload->video_payload = std::move(rtp_payload);
263 } else {
264 parsed_payload->video_payload =
265 rtp_payload.Slice(kFuAHeaderSize, rtp_payload.size() - kFuAHeaderSize);
266 }
267
268 if (original_nal_type == H264::NaluType::kIdr) {
269 parsed_payload->video_header.frame_type = VideoFrameType::kVideoFrameKey;
270 } else {
271 parsed_payload->video_header.frame_type = VideoFrameType::kVideoFrameDelta;
272 }
273 parsed_payload->video_header.width = 0;
274 parsed_payload->video_header.height = 0;
275 parsed_payload->video_header.codec = kVideoCodecH264;
276 parsed_payload->video_header.simulcastIdx = 0;
277 parsed_payload->video_header.is_first_packet_in_frame = first_fragment;
278 auto& h264_header = parsed_payload->video_header.video_type_header
279 .emplace<RTPVideoHeaderH264>();
280 h264_header.packetization_type = kH264FuA;
281 h264_header.nalu_type = original_nal_type;
282 if (first_fragment) {
283 h264_header.nalus[h264_header.nalus_length] = nalu;
284 h264_header.nalus_length = 1;
285 }
286 return parsed_payload;
287 }
288
289 } // namespace
290
291 absl::optional<VideoRtpDepacketizer::ParsedRtpPayload>
Parse(rtc::CopyOnWriteBuffer rtp_payload)292 VideoRtpDepacketizerH264::Parse(rtc::CopyOnWriteBuffer rtp_payload) {
293 if (rtp_payload.size() == 0) {
294 RTC_LOG(LS_ERROR) << "Empty payload.";
295 return absl::nullopt;
296 }
297
298 uint8_t nal_type = rtp_payload.cdata()[0] & kH264TypeMask;
299
300 if (nal_type == H264::NaluType::kFuA) {
301 // Fragmented NAL units (FU-A).
302 return ParseFuaNalu(std::move(rtp_payload));
303 } else {
304 // We handle STAP-A and single NALU's the same way here. The jitter buffer
305 // will depacketize the STAP-A into NAL units later.
306 return ProcessStapAOrSingleNalu(std::move(rtp_payload));
307 }
308 }
309
310 } // namespace webrtc
311