xref: /aosp_15_r20/external/libopus/doc/draft-ietf-payload-rtp-opus.xml (revision a58d3d2adb790c104798cd88c8a3aff4fa8b82cc)
1*a58d3d2aSXin Li<?xml version="1.0" encoding="UTF-8"?>
2*a58d3d2aSXin Li<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [
3*a58d3d2aSXin Li<!ENTITY rfc2119 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml'>
4*a58d3d2aSXin Li<!ENTITY rfc3389 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3389.xml'>
5*a58d3d2aSXin Li<!ENTITY rfc3550 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3550.xml'>
6*a58d3d2aSXin Li<!ENTITY rfc3711 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3711.xml'>
7*a58d3d2aSXin Li<!ENTITY rfc3551 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3551.xml'>
8*a58d3d2aSXin Li<!ENTITY rfc6838 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6838.xml'>
9*a58d3d2aSXin Li<!ENTITY rfc4855 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.4855.xml'>
10*a58d3d2aSXin Li<!ENTITY rfc4566 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.4566.xml'>
11*a58d3d2aSXin Li<!ENTITY rfc4585 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.4585.xml'>
12*a58d3d2aSXin Li<!ENTITY rfc3264 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3264.xml'>
13*a58d3d2aSXin Li<!ENTITY rfc2974 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.2974.xml'>
14*a58d3d2aSXin Li<!ENTITY rfc2326 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.2326.xml'>
15*a58d3d2aSXin Li<!ENTITY rfc3555 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3555.xml'>
16*a58d3d2aSXin Li<!ENTITY rfc5124 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.5124.xml'>
17*a58d3d2aSXin Li<!ENTITY rfc5405 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.5405.xml'>
18*a58d3d2aSXin Li<!ENTITY rfc5576 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.5576.xml'>
19*a58d3d2aSXin Li<!ENTITY rfc6562 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6562.xml'>
20*a58d3d2aSXin Li<!ENTITY rfc6716 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6716.xml'>
21*a58d3d2aSXin Li<!ENTITY rfc7202 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.7202.xml'>
22*a58d3d2aSXin Li<!ENTITY nbsp "&#160;">
23*a58d3d2aSXin Li  ]>
24*a58d3d2aSXin Li
25*a58d3d2aSXin Li  <rfc category="std" ipr="trust200902" docName="draft-ietf-payload-rtp-opus-11">
26*a58d3d2aSXin Li<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
27*a58d3d2aSXin Li
28*a58d3d2aSXin Li<?rfc strict="yes" ?>
29*a58d3d2aSXin Li<?rfc toc="yes" ?>
30*a58d3d2aSXin Li<?rfc tocdepth="3" ?>
31*a58d3d2aSXin Li<?rfc tocappendix='no' ?>
32*a58d3d2aSXin Li<?rfc tocindent='yes' ?>
33*a58d3d2aSXin Li<?rfc symrefs="yes" ?>
34*a58d3d2aSXin Li<?rfc sortrefs="yes" ?>
35*a58d3d2aSXin Li<?rfc compact="no" ?>
36*a58d3d2aSXin Li<?rfc subcompact="yes" ?>
37*a58d3d2aSXin Li<?rfc iprnotified="yes" ?>
38*a58d3d2aSXin Li
39*a58d3d2aSXin Li  <front>
40*a58d3d2aSXin Li    <title abbrev="RTP Payload Format for Opus">
41*a58d3d2aSXin Li      RTP Payload Format for the Opus Speech and Audio Codec
42*a58d3d2aSXin Li    </title>
43*a58d3d2aSXin Li
44*a58d3d2aSXin Li    <author fullname="Julian Spittka" initials="J." surname="Spittka">
45*a58d3d2aSXin Li      <address>
46*a58d3d2aSXin Li        <email>[email protected]</email>
47*a58d3d2aSXin Li      </address>
48*a58d3d2aSXin Li    </author>
49*a58d3d2aSXin Li
50*a58d3d2aSXin Li    <author initials='K.' surname='Vos' fullname='Koen Vos'>
51*a58d3d2aSXin Li      <organization>vocTone</organization>
52*a58d3d2aSXin Li      <address>
53*a58d3d2aSXin Li        <postal>
54*a58d3d2aSXin Li          <street></street>
55*a58d3d2aSXin Li          <code></code>
56*a58d3d2aSXin Li          <city></city>
57*a58d3d2aSXin Li          <region></region>
58*a58d3d2aSXin Li          <country></country>
59*a58d3d2aSXin Li        </postal>
60*a58d3d2aSXin Li        <email>[email protected]</email>
61*a58d3d2aSXin Li      </address>
62*a58d3d2aSXin Li    </author>
63*a58d3d2aSXin Li
64*a58d3d2aSXin Li    <author initials="JM" surname="Valin" fullname="Jean-Marc Valin">
65*a58d3d2aSXin Li      <organization>Mozilla</organization>
66*a58d3d2aSXin Li      <address>
67*a58d3d2aSXin Li        <postal>
68*a58d3d2aSXin Li          <street>331 E. Evelyn Avenue</street>
69*a58d3d2aSXin Li          <city>Mountain View</city>
70*a58d3d2aSXin Li          <region>CA</region>
71*a58d3d2aSXin Li          <code>94041</code>
72*a58d3d2aSXin Li          <country>USA</country>
73*a58d3d2aSXin Li        </postal>
74*a58d3d2aSXin Li        <email>[email protected]</email>
75*a58d3d2aSXin Li      </address>
76*a58d3d2aSXin Li    </author>
77*a58d3d2aSXin Li
78*a58d3d2aSXin Li    <date day='14' month='April' year='2015' />
79*a58d3d2aSXin Li
80*a58d3d2aSXin Li    <abstract>
81*a58d3d2aSXin Li      <t>
82*a58d3d2aSXin Li        This document defines the Real-time Transport Protocol (RTP) payload
83*a58d3d2aSXin Li        format for packetization of Opus encoded
84*a58d3d2aSXin Li        speech and audio data necessary to integrate the codec in the
85*a58d3d2aSXin Li        most compatible way. It also provides an applicability statement
86*a58d3d2aSXin Li        for the use of Opus over RTP. Further, it describes media type registrations
87*a58d3d2aSXin Li        for the RTP payload format.
88*a58d3d2aSXin Li      </t>
89*a58d3d2aSXin Li    </abstract>
90*a58d3d2aSXin Li  </front>
91*a58d3d2aSXin Li
92*a58d3d2aSXin Li  <middle>
93*a58d3d2aSXin Li    <section title='Introduction'>
94*a58d3d2aSXin Li      <t>
95*a58d3d2aSXin Li        Opus <xref target="RFC6716"/> is a speech and audio codec developed within the
96*a58d3d2aSXin Li        IETF Internet Wideband Audio Codec working group. The codec
97*a58d3d2aSXin Li        has a very low algorithmic delay and it
98*a58d3d2aSXin Li        is highly scalable in terms of audio bandwidth, bitrate, and
99*a58d3d2aSXin Li        complexity. Further, it provides different modes to efficiently encode speech signals
100*a58d3d2aSXin Li        as well as music signals, thus making it the codec of choice for
101*a58d3d2aSXin Li        various applications using the Internet or similar networks.
102*a58d3d2aSXin Li      </t>
103*a58d3d2aSXin Li      <t>
104*a58d3d2aSXin Li        This document defines the Real-time Transport Protocol (RTP)
105*a58d3d2aSXin Li        <xref target="RFC3550"/> payload format for packetization
106*a58d3d2aSXin Li        of Opus encoded speech and audio data necessary to
107*a58d3d2aSXin Li        integrate Opus in the
108*a58d3d2aSXin Li        most compatible way. It also provides an applicability statement
109*a58d3d2aSXin Li        for the use of Opus over RTP.
110*a58d3d2aSXin Li        Further, it describes media type registrations for
111*a58d3d2aSXin Li        the RTP payload format.
112*a58d3d2aSXin Li      </t>
113*a58d3d2aSXin Li    </section>
114*a58d3d2aSXin Li
115*a58d3d2aSXin Li    <section title='Conventions, Definitions and Acronyms used in this document'>
116*a58d3d2aSXin Li      <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
117*a58d3d2aSXin Li      "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
118*a58d3d2aSXin Li      document are to be interpreted as described in <xref target="RFC2119"/>.</t>
119*a58d3d2aSXin Li      <t>
120*a58d3d2aSXin Li      <list style='hanging'>
121*a58d3d2aSXin Li          <t hangText="audio bandwidth:"> The range of audio frequencies being coded</t>
122*a58d3d2aSXin Li          <t hangText="CBR:"> Constant bitrate</t>
123*a58d3d2aSXin Li          <t hangText="CPU:"> Central Processing Unit</t>
124*a58d3d2aSXin Li          <t hangText="DTX:"> Discontinuous transmission</t>
125*a58d3d2aSXin Li          <t hangText="FEC:"> Forward error correction</t>
126*a58d3d2aSXin Li          <t hangText="IP:"> Internet Protocol</t>
127*a58d3d2aSXin Li          <t hangText="samples:"> Speech or audio samples (per channel)</t>
128*a58d3d2aSXin Li          <t hangText="SDP:"> Session Description Protocol</t>
129*a58d3d2aSXin Li          <t hangText="VBR:"> Variable bitrate</t>
130*a58d3d2aSXin Li      </list>
131*a58d3d2aSXin Li      </t>
132*a58d3d2aSXin Li        <t>
133*a58d3d2aSXin Li          Throughout this document, we refer to the following definitions:
134*a58d3d2aSXin Li        </t>
135*a58d3d2aSXin Li          <texttable anchor='bandwidth_definitions'>
136*a58d3d2aSXin Li            <ttcol align='center'>Abbreviation</ttcol>
137*a58d3d2aSXin Li            <ttcol align='center'>Name</ttcol>
138*a58d3d2aSXin Li            <ttcol align='center'>Audio Bandwidth (Hz)</ttcol>
139*a58d3d2aSXin Li            <ttcol align='center'>Sampling Rate (Hz)</ttcol>
140*a58d3d2aSXin Li            <c>NB</c>
141*a58d3d2aSXin Li            <c>Narrowband</c>
142*a58d3d2aSXin Li            <c>0 - 4000</c>
143*a58d3d2aSXin Li            <c>8000</c>
144*a58d3d2aSXin Li
145*a58d3d2aSXin Li            <c>MB</c>
146*a58d3d2aSXin Li            <c>Mediumband</c>
147*a58d3d2aSXin Li            <c>0 - 6000</c>
148*a58d3d2aSXin Li            <c>12000</c>
149*a58d3d2aSXin Li
150*a58d3d2aSXin Li            <c>WB</c>
151*a58d3d2aSXin Li            <c>Wideband</c>
152*a58d3d2aSXin Li            <c>0 - 8000</c>
153*a58d3d2aSXin Li            <c>16000</c>
154*a58d3d2aSXin Li
155*a58d3d2aSXin Li            <c>SWB</c>
156*a58d3d2aSXin Li            <c>Super-wideband</c>
157*a58d3d2aSXin Li            <c>0 - 12000</c>
158*a58d3d2aSXin Li            <c>24000</c>
159*a58d3d2aSXin Li
160*a58d3d2aSXin Li            <c>FB</c>
161*a58d3d2aSXin Li            <c>Fullband</c>
162*a58d3d2aSXin Li            <c>0 - 20000</c>
163*a58d3d2aSXin Li            <c>48000</c>
164*a58d3d2aSXin Li
165*a58d3d2aSXin Li            <postamble>
166*a58d3d2aSXin Li              Audio bandwidth naming
167*a58d3d2aSXin Li            </postamble>
168*a58d3d2aSXin Li          </texttable>
169*a58d3d2aSXin Li    </section>
170*a58d3d2aSXin Li
171*a58d3d2aSXin Li    <section title='Opus Codec'>
172*a58d3d2aSXin Li      <t>
173*a58d3d2aSXin Li        Opus encodes speech
174*a58d3d2aSXin Li        signals as well as general audio signals. Two different modes can be
175*a58d3d2aSXin Li        chosen, a voice mode or an audio mode, to allow the most efficient coding
176*a58d3d2aSXin Li        depending on the type of the input signal, the sampling frequency of the
177*a58d3d2aSXin Li        input signal, and the intended application.
178*a58d3d2aSXin Li      </t>
179*a58d3d2aSXin Li
180*a58d3d2aSXin Li      <t>
181*a58d3d2aSXin Li        The voice mode allows efficient encoding of voice signals at lower
182*a58d3d2aSXin Li        bitrates while the audio mode is optimized for general audio signals at medium and
183*a58d3d2aSXin Li        higher bitrates.
184*a58d3d2aSXin Li      </t>
185*a58d3d2aSXin Li
186*a58d3d2aSXin Li      <t>
187*a58d3d2aSXin Li        Opus is highly scalable in terms of audio
188*a58d3d2aSXin Li        bandwidth, bitrate, and complexity. Further, Opus allows
189*a58d3d2aSXin Li        transmitting stereo signals with in-band signaling in the bit-stream.
190*a58d3d2aSXin Li      </t>
191*a58d3d2aSXin Li
192*a58d3d2aSXin Li      <section title='Network Bandwidth'>
193*a58d3d2aSXin Li          <t>
194*a58d3d2aSXin Li            Opus supports bitrates from 6&nbsp;kb/s to 510&nbsp;kb/s.
195*a58d3d2aSXin Li            The bitrate can be changed dynamically within that range.
196*a58d3d2aSXin Li            All
197*a58d3d2aSXin Li            other parameters being
198*a58d3d2aSXin Li            equal, higher bitrates result in higher audio quality.
199*a58d3d2aSXin Li          </t>
200*a58d3d2aSXin Li          <section title='Recommended Bitrate' anchor='bitrate_by_bandwidth'>
201*a58d3d2aSXin Li          <t>
202*a58d3d2aSXin Li            For a frame size of
203*a58d3d2aSXin Li            20&nbsp;ms, these
204*a58d3d2aSXin Li            are the bitrate "sweet spots" for Opus in various configurations:
205*a58d3d2aSXin Li
206*a58d3d2aSXin Li          <list style="symbols">
207*a58d3d2aSXin Li            <t>8-12 kb/s for NB speech,</t>
208*a58d3d2aSXin Li            <t>16-20 kb/s for WB speech,</t>
209*a58d3d2aSXin Li            <t>28-40 kb/s for FB speech,</t>
210*a58d3d2aSXin Li            <t>48-64 kb/s for FB mono music, and</t>
211*a58d3d2aSXin Li            <t>64-128 kb/s for FB stereo music.</t>
212*a58d3d2aSXin Li          </list>
213*a58d3d2aSXin Li        </t>
214*a58d3d2aSXin Li      </section>
215*a58d3d2aSXin Li        <section title='Variable versus Constant Bitrate'  anchor='variable-vs-constant-bitrate'>
216*a58d3d2aSXin Li          <t>
217*a58d3d2aSXin Li            For the same average bitrate, variable bitrate (VBR) can achieve higher audio quality
218*a58d3d2aSXin Li            than constant bitrate (CBR). For the majority of voice transmission applications, VBR
219*a58d3d2aSXin Li            is the best choice. One reason for choosing CBR is the potential
220*a58d3d2aSXin Li            information leak that <spanx style='emph'>might</spanx> occur when encrypting the
221*a58d3d2aSXin Li            compressed stream. See <xref target="RFC6562"/> for guidelines on when VBR is
222*a58d3d2aSXin Li            appropriate for encrypted audio communications. In the case where an existing
223*a58d3d2aSXin Li            VBR stream needs to be converted to CBR for security reasons, then the Opus padding
224*a58d3d2aSXin Li            mechanism described in <xref target="RFC6716"/> is the RECOMMENDED way to achieve padding
225*a58d3d2aSXin Li            because the RTP padding bit is unencrypted.</t>
226*a58d3d2aSXin Li
227*a58d3d2aSXin Li          <t>
228*a58d3d2aSXin Li            The bitrate can be adjusted at any point in time. To avoid congestion,
229*a58d3d2aSXin Li            the average bitrate SHOULD NOT exceed the available
230*a58d3d2aSXin Li            network bandwidth. If no target bitrate is specified, the bitrates specified in
231*a58d3d2aSXin Li            <xref target='bitrate_by_bandwidth'/> are RECOMMENDED.
232*a58d3d2aSXin Li          </t>
233*a58d3d2aSXin Li
234*a58d3d2aSXin Li        </section>
235*a58d3d2aSXin Li
236*a58d3d2aSXin Li        <section title='Discontinuous Transmission (DTX)'>
237*a58d3d2aSXin Li
238*a58d3d2aSXin Li          <t>
239*a58d3d2aSXin Li            Opus can, as described in <xref target='variable-vs-constant-bitrate'/>,
240*a58d3d2aSXin Li            be operated with a variable bitrate. In that case, the encoder will
241*a58d3d2aSXin Li            automatically reduce the bitrate for certain input signals, like periods
242*a58d3d2aSXin Li            of silence. When using continuous transmission, it will reduce the
243*a58d3d2aSXin Li            bitrate when the characteristics of the input signal permit, but
244*a58d3d2aSXin Li            will never interrupt the transmission to the receiver. Therefore, the
245*a58d3d2aSXin Li            received signal will maintain the same high level of audio quality over the
246*a58d3d2aSXin Li            full duration of a transmission while minimizing the average
247*a58d3d2aSXin Li            bitrate over time.
248*a58d3d2aSXin Li          </t>
249*a58d3d2aSXin Li
250*a58d3d2aSXin Li          <t>
251*a58d3d2aSXin Li            In cases where the bitrate of Opus needs to be reduced even
252*a58d3d2aSXin Li            further or in cases where only constant bitrate is available,
253*a58d3d2aSXin Li            the Opus encoder can use discontinuous
254*a58d3d2aSXin Li            transmission (DTX), where parts of the encoded signal that
255*a58d3d2aSXin Li            correspond to periods of silence in the input speech or audio signal
256*a58d3d2aSXin Li            are not transmitted to the receiver. A receiver can distinguish
257*a58d3d2aSXin Li            between DTX and packet loss by looking for gaps in the sequence
258*a58d3d2aSXin Li            number, as described by Section 4.1
259*a58d3d2aSXin Li            of&nbsp;<xref target="RFC3551"/>.
260*a58d3d2aSXin Li          </t>
261*a58d3d2aSXin Li
262*a58d3d2aSXin Li          <t>
263*a58d3d2aSXin Li            On the receiving side, the non-transmitted parts will be handled by a
264*a58d3d2aSXin Li            frame loss concealment unit in the Opus decoder which generates a
265*a58d3d2aSXin Li            comfort noise signal to replace the non-transmitted parts of the
266*a58d3d2aSXin Li            speech or audio signal. Use of <xref target="RFC3389"/> Comfort
267*a58d3d2aSXin Li            Noise (CN) with Opus is discouraged.
268*a58d3d2aSXin Li            The transmitter MUST drop whole frames only,
269*a58d3d2aSXin Li            based on the size of the last transmitted frame,
270*a58d3d2aSXin Li            to ensure successive RTP timestamps differ by a multiple of 120 and
271*a58d3d2aSXin Li            to allow the receiver to use whole frames for concealment.
272*a58d3d2aSXin Li          </t>
273*a58d3d2aSXin Li
274*a58d3d2aSXin Li          <t>
275*a58d3d2aSXin Li            DTX can be used with both variable and constant bitrate.
276*a58d3d2aSXin Li            It will have a slightly lower speech or audio
277*a58d3d2aSXin Li            quality than continuous transmission. Therefore, using continuous
278*a58d3d2aSXin Li            transmission is RECOMMENDED unless constraints on available network bandwidth
279*a58d3d2aSXin Li            are severe.
280*a58d3d2aSXin Li          </t>
281*a58d3d2aSXin Li
282*a58d3d2aSXin Li        </section>
283*a58d3d2aSXin Li
284*a58d3d2aSXin Li        </section>
285*a58d3d2aSXin Li
286*a58d3d2aSXin Li      <section title='Complexity'>
287*a58d3d2aSXin Li
288*a58d3d2aSXin Li        <t>
289*a58d3d2aSXin Li          Complexity of the encoder can be scaled to optimize for CPU resources in real time, mostly as
290*a58d3d2aSXin Li          a trade-off between audio quality and bitrate. Also, different modes of Opus have different complexity.
291*a58d3d2aSXin Li        </t>
292*a58d3d2aSXin Li
293*a58d3d2aSXin Li      </section>
294*a58d3d2aSXin Li
295*a58d3d2aSXin Li      <section title="Forward Error Correction (FEC)">
296*a58d3d2aSXin Li
297*a58d3d2aSXin Li        <t>
298*a58d3d2aSXin Li          The voice mode of Opus allows for embedding "in-band" forward error correction (FEC)
299*a58d3d2aSXin Li          data into the Opus bit-stream. This FEC scheme adds
300*a58d3d2aSXin Li          redundant information about the previous packet (N-1) to the current
301*a58d3d2aSXin Li          output packet N. For
302*a58d3d2aSXin Li          each frame, the encoder decides whether to use FEC based on (1) an
303*a58d3d2aSXin Li          externally-provided estimate of the channel's packet loss rate; (2) an
304*a58d3d2aSXin Li          externally-provided estimate of the channel's capacity; (3) the
305*a58d3d2aSXin Li          sensitivity of the audio or speech signal to packet loss; and (4) whether
306*a58d3d2aSXin Li          the receiving decoder has indicated it can take advantage of "in-band"
307*a58d3d2aSXin Li          FEC information. The decision to send "in-band" FEC information is
308*a58d3d2aSXin Li          entirely controlled by the encoder and therefore no special precautions
309*a58d3d2aSXin Li          for the payload have to be taken.
310*a58d3d2aSXin Li        </t>
311*a58d3d2aSXin Li
312*a58d3d2aSXin Li        <t>
313*a58d3d2aSXin Li          On the receiving side, the decoder can take advantage of this
314*a58d3d2aSXin Li          additional information when it loses a packet and the next packet
315*a58d3d2aSXin Li          is available.  In order to use the FEC data, the jitter buffer needs
316*a58d3d2aSXin Li          to provide access to payloads with the FEC data.
317*a58d3d2aSXin Li          Instead of performing loss concealment for a missing packet, the
318*a58d3d2aSXin Li          receiver can then configure its decoder to decode the FEC data from the next packet.
319*a58d3d2aSXin Li        </t>
320*a58d3d2aSXin Li
321*a58d3d2aSXin Li        <t>
322*a58d3d2aSXin Li          Any compliant Opus decoder is capable of ignoring
323*a58d3d2aSXin Li          FEC information when it is not needed, so encoding with FEC cannot cause
324*a58d3d2aSXin Li          interoperability problems.
325*a58d3d2aSXin Li          However, if FEC cannot be used on the receiving side, then FEC
326*a58d3d2aSXin Li          SHOULD NOT be used, as it leads to an inefficient usage of network
327*a58d3d2aSXin Li          resources. Decoder support for FEC SHOULD be indicated at the time a
328*a58d3d2aSXin Li          session is set up.
329*a58d3d2aSXin Li        </t>
330*a58d3d2aSXin Li
331*a58d3d2aSXin Li      </section>
332*a58d3d2aSXin Li
333*a58d3d2aSXin Li      <section title='Stereo Operation'>
334*a58d3d2aSXin Li
335*a58d3d2aSXin Li        <t>
336*a58d3d2aSXin Li          Opus allows for transmission of stereo audio signals. This operation
337*a58d3d2aSXin Li          is signaled in-band in the Opus bit-stream and no special arrangement
338*a58d3d2aSXin Li          is needed in the payload format. An
339*a58d3d2aSXin Li          Opus decoder is capable of handling a stereo encoding, but an
340*a58d3d2aSXin Li          application might only be capable of consuming a single audio
341*a58d3d2aSXin Li          channel.
342*a58d3d2aSXin Li        </t>
343*a58d3d2aSXin Li        <t>
344*a58d3d2aSXin Li          If a decoder cannot take advantage of the benefits of a stereo signal,
345*a58d3d2aSXin Li          this SHOULD be indicated at the time a session is set up. In that case,
346*a58d3d2aSXin Li          the sending side SHOULD NOT send stereo signals as it leads to an
347*a58d3d2aSXin Li          inefficient usage of network resources.
348*a58d3d2aSXin Li        </t>
349*a58d3d2aSXin Li
350*a58d3d2aSXin Li      </section>
351*a58d3d2aSXin Li
352*a58d3d2aSXin Li    </section>
353*a58d3d2aSXin Li
354*a58d3d2aSXin Li    <section title='Opus RTP Payload Format' anchor='opus-rtp-payload-format'>
355*a58d3d2aSXin Li      <t>The payload format for Opus consists of the RTP header and Opus payload
356*a58d3d2aSXin Li      data.</t>
357*a58d3d2aSXin Li      <section title='RTP Header Usage'>
358*a58d3d2aSXin Li        <t>The format of the RTP header is specified in <xref target="RFC3550"/>.
359*a58d3d2aSXin Li        The use of the fields of the RTP header by the Opus payload format is
360*a58d3d2aSXin Li        consistent with that specification.</t>
361*a58d3d2aSXin Li
362*a58d3d2aSXin Li        <t>The payload length of Opus is an integer number of octets and
363*a58d3d2aSXin Li        therefore no padding is necessary. The payload MAY be padded by an
364*a58d3d2aSXin Li        integer number of octets according to <xref target="RFC3550"/>,
365*a58d3d2aSXin Li        although the Opus internal padding is preferred.</t>
366*a58d3d2aSXin Li
367*a58d3d2aSXin Li        <t>The timestamp, sequence number, and marker bit (M) of the RTP header
368*a58d3d2aSXin Li        are used in accordance with Section 4.1
369*a58d3d2aSXin Li        of&nbsp;<xref target="RFC3551"/>.</t>
370*a58d3d2aSXin Li
371*a58d3d2aSXin Li        <t>The RTP payload type for Opus is to be assigned dynamically.</t>
372*a58d3d2aSXin Li
373*a58d3d2aSXin Li        <t>The receiving side MUST be prepared to receive duplicate RTP
374*a58d3d2aSXin Li        packets. The receiver MUST provide at most one of those payloads to the
375*a58d3d2aSXin Li        Opus decoder for decoding, and MUST discard the others.</t>
376*a58d3d2aSXin Li
377*a58d3d2aSXin Li        <t>Opus supports 5 different audio bandwidths, which can be adjusted during
378*a58d3d2aSXin Li        a stream.
379*a58d3d2aSXin Li        The RTP timestamp is incremented with a 48000 Hz clock rate
380*a58d3d2aSXin Li        for all modes of Opus and all sampling rates.
381*a58d3d2aSXin Li        The unit
382*a58d3d2aSXin Li        for the timestamp is samples per single (mono) channel. The RTP timestamp corresponds to the
383*a58d3d2aSXin Li        sample time of the first encoded sample in the encoded frame.
384*a58d3d2aSXin Li        For data encoded with sampling rates other than 48000 Hz,
385*a58d3d2aSXin Li	the sampling rate has to be adjusted to 48000 Hz.</t>
386*a58d3d2aSXin Li
387*a58d3d2aSXin Li      </section>
388*a58d3d2aSXin Li
389*a58d3d2aSXin Li      <section title='Payload Structure'>
390*a58d3d2aSXin Li        <t>
391*a58d3d2aSXin Li          The Opus encoder can output encoded frames representing 2.5, 5, 10, 20,
392*a58d3d2aSXin Li          40, or 60&nbsp;ms of speech or audio data. Further, an arbitrary number of frames can be
393*a58d3d2aSXin Li          combined into a packet, up to a maximum packet duration representing
394*a58d3d2aSXin Li          120&nbsp;ms of speech or audio data. The grouping of one or more Opus
395*a58d3d2aSXin Li          frames into a single Opus packet is defined in Section&nbsp;3 of
396*a58d3d2aSXin Li          <xref target="RFC6716"/>. An RTP payload MUST contain exactly one
397*a58d3d2aSXin Li          Opus packet as defined by that document.
398*a58d3d2aSXin Li        </t>
399*a58d3d2aSXin Li
400*a58d3d2aSXin Li        <t><xref target='payload-structure'/> shows the structure combined with the RTP header.</t>
401*a58d3d2aSXin Li
402*a58d3d2aSXin Li        <figure anchor="payload-structure"
403*a58d3d2aSXin Li                title="Packet structure with RTP header">
404*a58d3d2aSXin Li          <artwork align="center">
405*a58d3d2aSXin Li            <![CDATA[
406*a58d3d2aSXin Li+----------+--------------+
407*a58d3d2aSXin Li|RTP Header| Opus Payload |
408*a58d3d2aSXin Li+----------+--------------+
409*a58d3d2aSXin Li           ]]>
410*a58d3d2aSXin Li          </artwork>
411*a58d3d2aSXin Li        </figure>
412*a58d3d2aSXin Li
413*a58d3d2aSXin Li        <t>
414*a58d3d2aSXin Li          <xref target='opus-packetization'/> shows supported frame sizes in
415*a58d3d2aSXin Li          milliseconds of encoded speech or audio data for the speech and audio modes
416*a58d3d2aSXin Li          (Mode) and sampling rates (fs) of Opus and shows how the timestamp is
417*a58d3d2aSXin Li          incremented for packetization (ts incr). If the Opus encoder
418*a58d3d2aSXin Li          outputs multiple encoded frames into a single packet, the timestamp
419*a58d3d2aSXin Li          increment is the sum of the increments for the individual frames.
420*a58d3d2aSXin Li        </t>
421*a58d3d2aSXin Li
422*a58d3d2aSXin Li        <texttable anchor='opus-packetization' title="Supported Opus frame
423*a58d3d2aSXin Li         sizes and timestamp increments marked with an o. Unsupported marked with an x.">
424*a58d3d2aSXin Li            <ttcol align='center'>Mode</ttcol>
425*a58d3d2aSXin Li            <ttcol align='center'>fs</ttcol>
426*a58d3d2aSXin Li            <ttcol align='center'>2.5</ttcol>
427*a58d3d2aSXin Li            <ttcol align='center'>5</ttcol>
428*a58d3d2aSXin Li            <ttcol align='center'>10</ttcol>
429*a58d3d2aSXin Li            <ttcol align='center'>20</ttcol>
430*a58d3d2aSXin Li            <ttcol align='center'>40</ttcol>
431*a58d3d2aSXin Li            <ttcol align='center'>60</ttcol>
432*a58d3d2aSXin Li            <c>ts incr</c>
433*a58d3d2aSXin Li            <c>all</c>
434*a58d3d2aSXin Li            <c>120</c>
435*a58d3d2aSXin Li            <c>240</c>
436*a58d3d2aSXin Li            <c>480</c>
437*a58d3d2aSXin Li            <c>960</c>
438*a58d3d2aSXin Li            <c>1920</c>
439*a58d3d2aSXin Li            <c>2880</c>
440*a58d3d2aSXin Li            <c>voice</c>
441*a58d3d2aSXin Li            <c>NB/MB/WB/SWB/FB</c>
442*a58d3d2aSXin Li            <c>x</c>
443*a58d3d2aSXin Li            <c>x</c>
444*a58d3d2aSXin Li            <c>o</c>
445*a58d3d2aSXin Li            <c>o</c>
446*a58d3d2aSXin Li            <c>o</c>
447*a58d3d2aSXin Li            <c>o</c>
448*a58d3d2aSXin Li            <c>audio</c>
449*a58d3d2aSXin Li            <c>NB/WB/SWB/FB</c>
450*a58d3d2aSXin Li            <c>o</c>
451*a58d3d2aSXin Li            <c>o</c>
452*a58d3d2aSXin Li            <c>o</c>
453*a58d3d2aSXin Li            <c>o</c>
454*a58d3d2aSXin Li            <c>x</c>
455*a58d3d2aSXin Li            <c>x</c>
456*a58d3d2aSXin Li          </texttable>
457*a58d3d2aSXin Li
458*a58d3d2aSXin Li      </section>
459*a58d3d2aSXin Li
460*a58d3d2aSXin Li    </section>
461*a58d3d2aSXin Li
462*a58d3d2aSXin Li    <section title='Congestion Control'>
463*a58d3d2aSXin Li
464*a58d3d2aSXin Li      <t>The target bitrate of Opus can be adjusted at any point in time, thus
465*a58d3d2aSXin Li      allowing efficient congestion control. Furthermore, the amount
466*a58d3d2aSXin Li      of speech or audio data encoded in a
467*a58d3d2aSXin Li      single packet can be used for congestion control, since the transmission
468*a58d3d2aSXin Li      rate is inversely proportional to the packet duration. A lower packet
469*a58d3d2aSXin Li      transmission rate reduces the amount of header overhead, but at the same
470*a58d3d2aSXin Li      time increases latency and loss sensitivity, so it ought to be used with
471*a58d3d2aSXin Li      care.</t>
472*a58d3d2aSXin Li
473*a58d3d2aSXin Li      <t>Since UDP does not provide congestion control, applications that use
474*a58d3d2aSXin Li      RTP over UDP SHOULD implement their own congestion control above the
475*a58d3d2aSXin Li      UDP layer <xref target="RFC5405"/>. Work in the rmcat working group
476*a58d3d2aSXin Li      <xref target="rmcat"/> describes the
477*a58d3d2aSXin Li      interactions and conceptual interfaces necessary between the application
478*a58d3d2aSXin Li      components that relate to congestion control, including the RTP layer,
479*a58d3d2aSXin Li      the higher-level media codec control layer, and the lower-level
480*a58d3d2aSXin Li      transport interface, as well as components dedicated to congestion
481*a58d3d2aSXin Li      control functions.</t>
482*a58d3d2aSXin Li    </section>
483*a58d3d2aSXin Li
484*a58d3d2aSXin Li    <section title='IANA Considerations'>
485*a58d3d2aSXin Li      <t>One media subtype (audio/opus) has been defined and registered as
486*a58d3d2aSXin Li      described in the following section.</t>
487*a58d3d2aSXin Li
488*a58d3d2aSXin Li      <section title='Opus Media Type Registration'>
489*a58d3d2aSXin Li        <t>Media type registration is done according to <xref
490*a58d3d2aSXin Li        target="RFC6838"/> and <xref target="RFC4855"/>.<vspace
491*a58d3d2aSXin Li        blankLines='1'/></t>
492*a58d3d2aSXin Li
493*a58d3d2aSXin Li          <t>Type name: audio<vspace blankLines='1'/></t>
494*a58d3d2aSXin Li          <t>Subtype name: opus<vspace blankLines='1'/></t>
495*a58d3d2aSXin Li
496*a58d3d2aSXin Li          <t>Required parameters:</t>
497*a58d3d2aSXin Li          <t><list style="hanging">
498*a58d3d2aSXin Li            <t hangText="rate:"> the RTP timestamp is incremented with a
499*a58d3d2aSXin Li            48000 Hz clock rate for all modes of Opus and all sampling
500*a58d3d2aSXin Li            rates. For data encoded with sampling rates other than 48000 Hz,
501*a58d3d2aSXin Li            the sampling rate has to be adjusted to 48000 Hz.
502*a58d3d2aSXin Li          </t>
503*a58d3d2aSXin Li          </list></t>
504*a58d3d2aSXin Li
505*a58d3d2aSXin Li          <t>Optional parameters:</t>
506*a58d3d2aSXin Li
507*a58d3d2aSXin Li          <t><list style="hanging">
508*a58d3d2aSXin Li            <t hangText="maxplaybackrate:">
509*a58d3d2aSXin Li              a hint about the maximum output sampling rate that the receiver is
510*a58d3d2aSXin Li              capable of rendering in Hz.
511*a58d3d2aSXin Li              The decoder MUST be capable of decoding
512*a58d3d2aSXin Li              any audio bandwidth, but, due to hardware limitations, only signals
513*a58d3d2aSXin Li              up to the specified sampling rate can be played back. Sending signals
514*a58d3d2aSXin Li              with higher audio bandwidth results in higher than necessary network
515*a58d3d2aSXin Li              usage and encoding complexity, so an encoder SHOULD NOT encode
516*a58d3d2aSXin Li              frequencies above the audio bandwidth specified by maxplaybackrate.
517*a58d3d2aSXin Li              This parameter can take any value between 8000 and 48000, although
518*a58d3d2aSXin Li              commonly the value will match one of the Opus bandwidths
519*a58d3d2aSXin Li              (<xref target="bandwidth_definitions"/>).
520*a58d3d2aSXin Li              By default, the receiver is assumed to have no limitations, i.e. 48000.
521*a58d3d2aSXin Li              <vspace blankLines='1'/>
522*a58d3d2aSXin Li            </t>
523*a58d3d2aSXin Li
524*a58d3d2aSXin Li            <t hangText="sprop-maxcapturerate:">
525*a58d3d2aSXin Li              a hint about the maximum input sampling rate that the sender is likely to produce.
526*a58d3d2aSXin Li              This is not a guarantee that the sender will never send any higher bandwidth
527*a58d3d2aSXin Li              (e.g. it could send a pre-recorded prompt that uses a higher bandwidth), but it
528*a58d3d2aSXin Li              indicates to the receiver that frequencies above this maximum can safely be discarded.
529*a58d3d2aSXin Li              This parameter is useful to avoid wasting receiver resources by operating the audio
530*a58d3d2aSXin Li              processing pipeline (e.g. echo cancellation) at a higher rate than necessary.
531*a58d3d2aSXin Li              This parameter can take any value between 8000 and 48000, although
532*a58d3d2aSXin Li              commonly the value will match one of the Opus bandwidths
533*a58d3d2aSXin Li              (<xref target="bandwidth_definitions"/>).
534*a58d3d2aSXin Li              By default, the sender is assumed to have no limitations, i.e. 48000.
535*a58d3d2aSXin Li              <vspace blankLines='1'/>
536*a58d3d2aSXin Li            </t>
537*a58d3d2aSXin Li
538*a58d3d2aSXin Li            <t hangText="maxptime:"> the maximum duration of media represented
539*a58d3d2aSXin Li            by a packet (according to Section&nbsp;6 of
540*a58d3d2aSXin Li            <xref target="RFC4566"/>) that a decoder wants to receive, in
541*a58d3d2aSXin Li            milliseconds rounded up to the next full integer value.
542*a58d3d2aSXin Li            Possible values are 3, 5, 10, 20, 40, 60, or an arbitrary
543*a58d3d2aSXin Li            multiple of an Opus frame size rounded up to the next full integer
544*a58d3d2aSXin Li            value, up to a maximum value of 120, as
545*a58d3d2aSXin Li            defined in <xref target='opus-rtp-payload-format'/>. If no value is
546*a58d3d2aSXin Li              specified, the default is 120.
547*a58d3d2aSXin Li              <vspace blankLines='1'/></t>
548*a58d3d2aSXin Li
549*a58d3d2aSXin Li            <t hangText="ptime:"> the preferred duration of media represented
550*a58d3d2aSXin Li            by a packet (according to Section&nbsp;6 of
551*a58d3d2aSXin Li            <xref target="RFC4566"/>) that a decoder wants to receive, in
552*a58d3d2aSXin Li            milliseconds rounded up to the next full integer value.
553*a58d3d2aSXin Li            Possible values are 3, 5, 10, 20, 40, 60, or an arbitrary
554*a58d3d2aSXin Li            multiple of an Opus frame size rounded up to the next full integer
555*a58d3d2aSXin Li            value, up to a maximum value of 120, as defined in <xref
556*a58d3d2aSXin Li            target='opus-rtp-payload-format'/>. If no value is
557*a58d3d2aSXin Li              specified, the default is 20.
558*a58d3d2aSXin Li              <vspace blankLines='1'/></t>
559*a58d3d2aSXin Li
560*a58d3d2aSXin Li            <t hangText="maxaveragebitrate:"> specifies the maximum average
561*a58d3d2aSXin Li            receive bitrate of a session in bits per second (b/s). The actual
562*a58d3d2aSXin Li            value of the bitrate can vary, as it is dependent on the
563*a58d3d2aSXin Li            characteristics of the media in a packet. Note that the maximum
564*a58d3d2aSXin Li            average bitrate MAY be modified dynamically during a session. Any
565*a58d3d2aSXin Li            positive integer is allowed, but values outside the range
566*a58d3d2aSXin Li            6000 to 510000 SHOULD be ignored. If no value is specified, the
567*a58d3d2aSXin Li            maximum value specified in <xref target='bitrate_by_bandwidth'/>
568*a58d3d2aSXin Li            for the corresponding mode of Opus and corresponding maxplaybackrate
569*a58d3d2aSXin Li            is the default.<vspace blankLines='1'/></t>
570*a58d3d2aSXin Li
571*a58d3d2aSXin Li            <t hangText="stereo:">
572*a58d3d2aSXin Li              specifies whether the decoder prefers receiving stereo or mono signals.
573*a58d3d2aSXin Li              Possible values are 1 and 0, where 1 specifies that stereo signals are preferred,
574*a58d3d2aSXin Li              and 0 specifies that only mono signals are preferred.
575*a58d3d2aSXin Li              Independent of the stereo parameter, every receiver MUST be able to receive and
576*a58d3d2aSXin Li              decode stereo signals, but sending stereo signals to a receiver that signaled a
577*a58d3d2aSXin Li              preference for mono signals may result in higher than necessary network
578*a58d3d2aSXin Li              utilization and encoding complexity. If no value is specified,
579*a58d3d2aSXin Li              the default is 0 (mono).<vspace blankLines='1'/>
580*a58d3d2aSXin Li            </t>
581*a58d3d2aSXin Li
582*a58d3d2aSXin Li            <t hangText="sprop-stereo:">
583*a58d3d2aSXin Li              specifies whether the sender is likely to produce stereo audio.
584*a58d3d2aSXin Li              Possible values are 1 and 0, where 1 specifies that stereo signals are likely to
585*a58d3d2aSXin Li              be sent, and 0 specifies that the sender will likely only send mono.
586*a58d3d2aSXin Li              This is not a guarantee that the sender will never send stereo audio
587*a58d3d2aSXin Li              (e.g. it could send a pre-recorded prompt that uses stereo), but it
588*a58d3d2aSXin Li              indicates to the receiver that the received signal can be safely downmixed to mono.
589*a58d3d2aSXin Li              This parameter is useful to avoid wasting receiver resources by operating the audio
590*a58d3d2aSXin Li              processing pipeline (e.g. echo cancellation) in stereo when not necessary.
591*a58d3d2aSXin Li              If no value is specified, the default is 0
592*a58d3d2aSXin Li              (mono).<vspace blankLines='1'/>
593*a58d3d2aSXin Li            </t>
594*a58d3d2aSXin Li
595*a58d3d2aSXin Li            <t hangText="cbr:">
596*a58d3d2aSXin Li              specifies if the decoder prefers the use of a constant bitrate versus
597*a58d3d2aSXin Li              variable bitrate. Possible values are 1 and 0, where 1 specifies constant
598*a58d3d2aSXin Li              bitrate and 0 specifies variable bitrate. If no value is specified,
599*a58d3d2aSXin Li              the default is 0 (vbr). When cbr is 1, the maximum average bitrate can still
600*a58d3d2aSXin Li              change, e.g. to adapt to changing network conditions.<vspace blankLines='1'/>
601*a58d3d2aSXin Li            </t>
602*a58d3d2aSXin Li
603*a58d3d2aSXin Li            <t hangText="useinbandfec:"> specifies that the decoder has the capability to
604*a58d3d2aSXin Li            take advantage of the Opus in-band FEC. Possible values are 1 and 0.
605*a58d3d2aSXin Li            Providing 0 when FEC cannot be used on the receiving side is
606*a58d3d2aSXin Li            RECOMMENDED. If no
607*a58d3d2aSXin Li            value is specified, useinbandfec is assumed to be 0.
608*a58d3d2aSXin Li            This parameter is only a preference and the receiver MUST be able to process
609*a58d3d2aSXin Li            packets that include FEC information, even if it means the FEC part is discarded.
610*a58d3d2aSXin Li            <vspace blankLines='1'/></t>
611*a58d3d2aSXin Li
612*a58d3d2aSXin Li            <t hangText="usedtx:"> specifies if the decoder prefers the use of
613*a58d3d2aSXin Li            DTX. Possible values are 1 and 0. If no value is specified, the
614*a58d3d2aSXin Li            default is 0.<vspace blankLines='1'/></t>
615*a58d3d2aSXin Li          </list></t>
616*a58d3d2aSXin Li
617*a58d3d2aSXin Li          <t>Encoding considerations:<vspace blankLines='1'/></t>
618*a58d3d2aSXin Li          <t><list style="hanging">
619*a58d3d2aSXin Li            <t>The Opus media type is framed and consists of binary data according
620*a58d3d2aSXin Li            to Section&nbsp;4.8 in <xref target="RFC6838"/>.</t>
621*a58d3d2aSXin Li          </list></t>
622*a58d3d2aSXin Li
623*a58d3d2aSXin Li          <t>Security considerations: </t>
624*a58d3d2aSXin Li          <t><list style="hanging">
625*a58d3d2aSXin Li            <t>See <xref target='security-considerations'/> of this document.</t>
626*a58d3d2aSXin Li          </list></t>
627*a58d3d2aSXin Li
628*a58d3d2aSXin Li          <t>Interoperability considerations: none<vspace blankLines='1'/></t>
629*a58d3d2aSXin Li	  <t>Published specification: RFC [XXXX]</t>
630*a58d3d2aSXin Li	  <t>Note to the RFC Editor: Replace [XXXX] with the number of the published
631*a58d3d2aSXin Li          RFC.<vspace blankLines='1'/></t>
632*a58d3d2aSXin Li
633*a58d3d2aSXin Li          <t>Applications that use this media type: </t>
634*a58d3d2aSXin Li          <t><list style="hanging">
635*a58d3d2aSXin Li            <t>Any application that requires the transport of
636*a58d3d2aSXin Li            speech or audio data can use this media type. Examples include,
637*a58d3d2aSXin Li            but are not limited to, audio and video conferencing, Voice over IP,
638*a58d3d2aSXin Li            and media streaming.</t>
639*a58d3d2aSXin Li          </list></t>
640*a58d3d2aSXin Li
641*a58d3d2aSXin Li          <t>Fragment identifier considerations: N/A<vspace blankLines='1'/></t>
642*a58d3d2aSXin Li
643*a58d3d2aSXin Li          <t>Person &amp; email address to contact for further information:</t>
644*a58d3d2aSXin Li          <t><list style="hanging">
645*a58d3d2aSXin Li            <t>SILK Support silksupport@skype.net</t>
646*a58d3d2aSXin Li            <t>Jean-Marc Valin jmvalin@jmvalin.ca</t>
647*a58d3d2aSXin Li          </list></t>
648*a58d3d2aSXin Li
649*a58d3d2aSXin Li          <t>Intended usage: COMMON<vspace blankLines='1'/></t>
650*a58d3d2aSXin Li
651*a58d3d2aSXin Li          <t>Restrictions on usage:<vspace blankLines='1'/></t>
652*a58d3d2aSXin Li
653*a58d3d2aSXin Li          <t><list style="hanging">
654*a58d3d2aSXin Li            <t>For transfer over RTP, the RTP payload format (<xref
655*a58d3d2aSXin Li            target='opus-rtp-payload-format'/> of this document) SHALL be
656*a58d3d2aSXin Li            used.</t>
657*a58d3d2aSXin Li          </list></t>
658*a58d3d2aSXin Li
659*a58d3d2aSXin Li          <t>Author:</t>
660*a58d3d2aSXin Li          <t><list style="hanging">
661*a58d3d2aSXin Li            <t>Julian Spittka jspittka@gmail.com<vspace blankLines='1'/></t>
662*a58d3d2aSXin Li            <t>Koen Vos koenvos74@gmail.com<vspace blankLines='1'/></t>
663*a58d3d2aSXin Li            <t>Jean-Marc Valin jmvalin@jmvalin.ca<vspace blankLines='1'/></t>
664*a58d3d2aSXin Li          </list></t>
665*a58d3d2aSXin Li
666*a58d3d2aSXin Li          <t> Change controller: IETF Payload Working Group delegated from the IESG</t>
667*a58d3d2aSXin Li      </section>
668*a58d3d2aSXin Li    </section>
669*a58d3d2aSXin Li
670*a58d3d2aSXin Li    <section title='SDP Considerations'>
671*a58d3d2aSXin Li        <t>The information described in the media type specification has a
672*a58d3d2aSXin Li        specific mapping to fields in the Session Description Protocol (SDP)
673*a58d3d2aSXin Li        <xref target="RFC4566"/>, which is commonly used to describe RTP
674*a58d3d2aSXin Li        sessions. When SDP is used to specify sessions employing Opus,
675*a58d3d2aSXin Li        the mapping is as follows:</t>
676*a58d3d2aSXin Li
677*a58d3d2aSXin Li        <t>
678*a58d3d2aSXin Li          <list style="symbols">
679*a58d3d2aSXin Li            <t>The media type ("audio") goes in SDP "m=" as the media name.</t>
680*a58d3d2aSXin Li
681*a58d3d2aSXin Li            <t>The media subtype ("opus") goes in SDP "a=rtpmap" as the encoding
682*a58d3d2aSXin Li            name. The RTP clock rate in "a=rtpmap" MUST be 48000 and the number of
683*a58d3d2aSXin Li            channels MUST be 2.</t>
684*a58d3d2aSXin Li
685*a58d3d2aSXin Li            <t>The OPTIONAL media type parameters "ptime" and "maxptime" are
686*a58d3d2aSXin Li            mapped to "a=ptime" and "a=maxptime" attributes, respectively, in the
687*a58d3d2aSXin Li            SDP.</t>
688*a58d3d2aSXin Li
689*a58d3d2aSXin Li            <t>The OPTIONAL media type parameters "maxaveragebitrate",
690*a58d3d2aSXin Li            "maxplaybackrate", "stereo", "cbr", "useinbandfec", and
691*a58d3d2aSXin Li            "usedtx", when present, MUST be included in the "a=fmtp" attribute
692*a58d3d2aSXin Li            in the SDP, expressed as a media type string in the form of a
693*a58d3d2aSXin Li            semicolon-separated list of parameter=value pairs (e.g.,
694*a58d3d2aSXin Li            maxplaybackrate=48000). They MUST NOT be specified in an
695*a58d3d2aSXin Li            SSRC-specific "fmtp" source-level attribute (as defined in
696*a58d3d2aSXin Li            Section&nbsp;6.3 of&nbsp;<xref target="RFC5576"/>).</t>
697*a58d3d2aSXin Li
698*a58d3d2aSXin Li            <t>The OPTIONAL media type parameters "sprop-maxcapturerate"
699*a58d3d2aSXin Li            and "sprop-stereo" MAY be mapped to the "a=fmtp" SDP attribute by
700*a58d3d2aSXin Li            copying them directly from the media type parameter string as part
701*a58d3d2aSXin Li            of the semicolon-separated list of parameter=value pairs (e.g.,
702*a58d3d2aSXin Li            sprop-stereo=1). These same OPTIONAL media type parameters MAY also
703*a58d3d2aSXin Li            be specified using an SSRC-specific "fmtp" source-level attribute
704*a58d3d2aSXin Li            as described in Section&nbsp;6.3 of&nbsp;<xref target="RFC5576"/>.
705*a58d3d2aSXin Li            They MAY be specified in both places, in which case the parameter
706*a58d3d2aSXin Li            in the source-level attribute overrides the one found on the
707*a58d3d2aSXin Li            "a=fmtp" line. The value of any parameter which is not specified in
708*a58d3d2aSXin Li            a source-level source attribute MUST be taken from the "a=fmtp"
709*a58d3d2aSXin Li            line, if it is present there.</t>
710*a58d3d2aSXin Li
711*a58d3d2aSXin Li          </list>
712*a58d3d2aSXin Li        </t>
713*a58d3d2aSXin Li
714*a58d3d2aSXin Li        <t>Below are some examples of SDP session descriptions for Opus:</t>
715*a58d3d2aSXin Li
716*a58d3d2aSXin Li        <t>Example 1: Standard mono session with 48000 Hz clock rate</t>
717*a58d3d2aSXin Li          <figure>
718*a58d3d2aSXin Li            <artwork>
719*a58d3d2aSXin Li              <![CDATA[
720*a58d3d2aSXin Li    m=audio 54312 RTP/AVP 101
721*a58d3d2aSXin Li    a=rtpmap:101 opus/48000/2
722*a58d3d2aSXin Li              ]]>
723*a58d3d2aSXin Li            </artwork>
724*a58d3d2aSXin Li          </figure>
725*a58d3d2aSXin Li
726*a58d3d2aSXin Li
727*a58d3d2aSXin Li        <t>Example 2: 16000 Hz clock rate, maximum packet size of 40 ms,
728*a58d3d2aSXin Li        recommended packet size of 40 ms, maximum average bitrate of 20000 bps,
729*a58d3d2aSXin Li        prefers to receive stereo but only plans to send mono, FEC is desired,
730*a58d3d2aSXin Li        DTX is not desired</t>
731*a58d3d2aSXin Li
732*a58d3d2aSXin Li        <figure>
733*a58d3d2aSXin Li          <artwork>
734*a58d3d2aSXin Li            <![CDATA[
735*a58d3d2aSXin Li    m=audio 54312 RTP/AVP 101
736*a58d3d2aSXin Li    a=rtpmap:101 opus/48000/2
737*a58d3d2aSXin Li    a=fmtp:101 maxplaybackrate=16000; sprop-maxcapturerate=16000;
738*a58d3d2aSXin Li    maxaveragebitrate=20000; stereo=1; useinbandfec=1; usedtx=0
739*a58d3d2aSXin Li    a=ptime:40
740*a58d3d2aSXin Li    a=maxptime:40
741*a58d3d2aSXin Li            ]]>
742*a58d3d2aSXin Li          </artwork>
743*a58d3d2aSXin Li        </figure>
744*a58d3d2aSXin Li
745*a58d3d2aSXin Li        <t>Example 3: Two-way full-band stereo preferred</t>
746*a58d3d2aSXin Li
747*a58d3d2aSXin Li        <figure>
748*a58d3d2aSXin Li          <artwork>
749*a58d3d2aSXin Li            <![CDATA[
750*a58d3d2aSXin Li    m=audio 54312 RTP/AVP 101
751*a58d3d2aSXin Li    a=rtpmap:101 opus/48000/2
752*a58d3d2aSXin Li    a=fmtp:101 stereo=1; sprop-stereo=1
753*a58d3d2aSXin Li            ]]>
754*a58d3d2aSXin Li          </artwork>
755*a58d3d2aSXin Li        </figure>
756*a58d3d2aSXin Li
757*a58d3d2aSXin Li
758*a58d3d2aSXin Li      <section title='SDP Offer/Answer Considerations'>
759*a58d3d2aSXin Li
760*a58d3d2aSXin Li          <t>When using the offer-answer procedure described in <xref
761*a58d3d2aSXin Li          target="RFC3264"/> to negotiate the use of Opus, the following
762*a58d3d2aSXin Li          considerations apply:</t>
763*a58d3d2aSXin Li
764*a58d3d2aSXin Li          <t><list style="symbols">
765*a58d3d2aSXin Li
766*a58d3d2aSXin Li            <t>Opus supports several clock rates. For signaling purposes only
767*a58d3d2aSXin Li            the highest, i.e. 48000, is used. The actual clock rate of the
768*a58d3d2aSXin Li            corresponding media is signaled inside the payload and is not
769*a58d3d2aSXin Li            restricted by this payload format description. The decoder MUST be
770*a58d3d2aSXin Li            capable of decoding every received clock rate. An example
771*a58d3d2aSXin Li            is shown below:
772*a58d3d2aSXin Li
773*a58d3d2aSXin Li            <figure>
774*a58d3d2aSXin Li              <artwork>
775*a58d3d2aSXin Li                <![CDATA[
776*a58d3d2aSXin Li    m=audio 54312 RTP/AVP 100
777*a58d3d2aSXin Li    a=rtpmap:100 opus/48000/2
778*a58d3d2aSXin Li                ]]>
779*a58d3d2aSXin Li              </artwork>
780*a58d3d2aSXin Li            </figure>
781*a58d3d2aSXin Li            </t>
782*a58d3d2aSXin Li
783*a58d3d2aSXin Li            <t>The "ptime" and "maxptime" parameters are unidirectional
784*a58d3d2aSXin Li            receive-only parameters and typically will not compromise
785*a58d3d2aSXin Li            interoperability; however, some values might cause application
786*a58d3d2aSXin Li            performance to suffer. <xref
787*a58d3d2aSXin Li            target="RFC3264"/> defines the SDP offer-answer handling of the
788*a58d3d2aSXin Li            "ptime" parameter. The "maxptime" parameter MUST be handled in the
789*a58d3d2aSXin Li            same way.</t>
790*a58d3d2aSXin Li
791*a58d3d2aSXin Li            <t>
792*a58d3d2aSXin Li              The "maxplaybackrate" parameter is a unidirectional receive-only
793*a58d3d2aSXin Li              parameter that reflects limitations of the local receiver. When
794*a58d3d2aSXin Li              sending to a single destination, a sender MUST NOT use an audio
795*a58d3d2aSXin Li              bandwidth higher than necessary to make full use of audio sampled at
796*a58d3d2aSXin Li              a sampling rate of "maxplaybackrate". Gateways or senders that
797*a58d3d2aSXin Li              are sending the same encoded audio to multiple destinations
798*a58d3d2aSXin Li              SHOULD NOT use an audio bandwidth higher than necessary to
799*a58d3d2aSXin Li              represent audio sampled at "maxplaybackrate", as this would lead
800*a58d3d2aSXin Li              to inefficient use of network resources.
801*a58d3d2aSXin Li              The "maxplaybackrate" parameter does not
802*a58d3d2aSXin Li              affect interoperability. Also, this parameter SHOULD NOT be used
803*a58d3d2aSXin Li              to adjust the audio bandwidth as a function of the bitrate, as this
804*a58d3d2aSXin Li              is the responsibility of the Opus encoder implementation.
805*a58d3d2aSXin Li            </t>
806*a58d3d2aSXin Li
807*a58d3d2aSXin Li            <t>The "maxaveragebitrate" parameter is a unidirectional receive-only
808*a58d3d2aSXin Li            parameter that reflects limitations of the local receiver. The sender
809*a58d3d2aSXin Li            of the other side MUST NOT send with an average bitrate higher than
810*a58d3d2aSXin Li            "maxaveragebitrate" as it might overload the network and/or
811*a58d3d2aSXin Li            receiver. The "maxaveragebitrate" parameter typically will not
812*a58d3d2aSXin Li            compromise interoperability; however, some values might cause
813*a58d3d2aSXin Li            application performance to suffer, and ought to be set with
814*a58d3d2aSXin Li            care.</t>
815*a58d3d2aSXin Li
816*a58d3d2aSXin Li            <t>The "sprop-maxcapturerate" and "sprop-stereo" parameters are
817*a58d3d2aSXin Li            unidirectional sender-only parameters that reflect limitations of
818*a58d3d2aSXin Li            the sender side.
819*a58d3d2aSXin Li            They allow the receiver to set up a reduced-complexity audio
820*a58d3d2aSXin Li            processing pipeline if the sender is not planning to use the full
821*a58d3d2aSXin Li            range of Opus's capabilities.
822*a58d3d2aSXin Li            Neither "sprop-maxcapturerate" nor "sprop-stereo" affects
823*a58d3d2aSXin Li            interoperability, and the receiver MUST be capable of receiving any signal.
824*a58d3d2aSXin Li            </t>
825*a58d3d2aSXin Li
826*a58d3d2aSXin Li            <t>
827*a58d3d2aSXin Li              The "stereo" parameter is a unidirectional receive-only
828*a58d3d2aSXin Li              parameter. When sending to a single destination, a sender MUST
829*a58d3d2aSXin Li              NOT use stereo when "stereo" is 0. Gateways or senders that are
830*a58d3d2aSXin Li              sending the same encoded audio to multiple destinations SHOULD
831*a58d3d2aSXin Li              NOT use stereo when "stereo" is 0, as this would lead to
832*a58d3d2aSXin Li              inefficient use of network resources. The "stereo" parameter does
833*a58d3d2aSXin Li              not affect interoperability.
834*a58d3d2aSXin Li            </t>
835*a58d3d2aSXin Li
836*a58d3d2aSXin Li            <t>
837*a58d3d2aSXin Li              The "cbr" parameter is a unidirectional receive-only
838*a58d3d2aSXin Li              parameter.
839*a58d3d2aSXin Li            </t>
840*a58d3d2aSXin Li
841*a58d3d2aSXin Li            <t>The "useinbandfec" parameter is a unidirectional receive-only
842*a58d3d2aSXin Li            parameter.</t>
843*a58d3d2aSXin Li
844*a58d3d2aSXin Li            <t>The "usedtx" parameter is a unidirectional receive-only
845*a58d3d2aSXin Li            parameter.</t>
846*a58d3d2aSXin Li
847*a58d3d2aSXin Li            <t>Any unknown parameter in an offer MUST be ignored by the receiver
848*a58d3d2aSXin Li            and MUST be removed from the answer.</t>
849*a58d3d2aSXin Li
850*a58d3d2aSXin Li          </list></t>
851*a58d3d2aSXin Li
852*a58d3d2aSXin Li        <t>
853*a58d3d2aSXin Li	  The Opus parameters in an SDP Offer/Answer exchange are completely
854*a58d3d2aSXin Li          orthogonal, and there is no relationship between the SDP Offer and
855*a58d3d2aSXin Li          the Answer.
856*a58d3d2aSXin Li        </t>
857*a58d3d2aSXin Li      </section>
858*a58d3d2aSXin Li
859*a58d3d2aSXin Li      <section title='Declarative SDP Considerations for Opus'>
860*a58d3d2aSXin Li
861*a58d3d2aSXin Li        <t>For declarative use of SDP such as in Session Announcement Protocol
862*a58d3d2aSXin Li        (SAP), <xref target="RFC2974"/>, and RTSP, <xref target="RFC2326"/>, for
863*a58d3d2aSXin Li        Opus, the following needs to be considered:</t>
864*a58d3d2aSXin Li
865*a58d3d2aSXin Li        <t><list style="symbols">
866*a58d3d2aSXin Li
867*a58d3d2aSXin Li          <t>The values for "maxptime", "ptime", "maxplaybackrate", and
868*a58d3d2aSXin Li          "maxaveragebitrate" ought to be selected carefully to ensure that a
869*a58d3d2aSXin Li          reasonable performance can be achieved for the participants of a session.</t>
870*a58d3d2aSXin Li
871*a58d3d2aSXin Li          <t>
872*a58d3d2aSXin Li            The values for "maxptime" and "ptime" of the payload
873*a58d3d2aSXin Li            format configuration are recommendations by the decoding side to ensure
874*a58d3d2aSXin Li            the best performance for the decoder.
875*a58d3d2aSXin Li          </t>
876*a58d3d2aSXin Li
877*a58d3d2aSXin Li          <t>All other parameters of the payload format configuration are declarative
878*a58d3d2aSXin Li          and a participant MUST use the configurations that are provided for
879*a58d3d2aSXin Li          the session. More than one configuration can be provided if necessary
880*a58d3d2aSXin Li          by declaring multiple RTP payload types; however, the number of types
881*a58d3d2aSXin Li          ought to be kept small.</t>
882*a58d3d2aSXin Li        </list></t>
883*a58d3d2aSXin Li      </section>
884*a58d3d2aSXin Li  </section>
885*a58d3d2aSXin Li
886*a58d3d2aSXin Li    <section title='Security Considerations' anchor='security-considerations'>
887*a58d3d2aSXin Li
888*a58d3d2aSXin Li      <t>Use of variable bitrate (VBR) is subject to the security considerations in
889*a58d3d2aSXin Li      <xref target="RFC6562"/>.</t>
890*a58d3d2aSXin Li
891*a58d3d2aSXin Li      <t>RTP packets using the payload format defined in this specification
892*a58d3d2aSXin Li      are subject to the security considerations discussed in the RTP
893*a58d3d2aSXin Li      specification <xref target="RFC3550"/>, and in any applicable RTP profile such as
894*a58d3d2aSXin Li      RTP/AVP <xref target="RFC3551"/>, RTP/AVPF <xref target="RFC4585"/>,
895*a58d3d2aSXin Li      RTP/SAVP <xref target="RFC3711"/> or RTP/SAVPF <xref target="RFC5124"/>.
896*a58d3d2aSXin Li      However, as "Securing the RTP Protocol Framework:
897*a58d3d2aSXin Li      Why RTP Does Not Mandate a Single Media Security Solution"
898*a58d3d2aSXin Li      <xref target="RFC7202"/> discusses, it is not an RTP payload
899*a58d3d2aSXin Li      format's responsibility to discuss or mandate what solutions are used
900*a58d3d2aSXin Li      to meet the basic security goals like confidentiality, integrity and
901*a58d3d2aSXin Li      source authenticity for RTP in general.  This responsibility lies with
902*a58d3d2aSXin Li      anyone using RTP in an application.  They can find guidance on
903*a58d3d2aSXin Li      available security mechanisms and important considerations in Options
904*a58d3d2aSXin Li      for Securing RTP Sessions [I-D.ietf-avtcore-rtp-security-options].
905*a58d3d2aSXin Li      Applications SHOULD use one or more appropriate strong security
906*a58d3d2aSXin Li      mechanisms.</t>
907*a58d3d2aSXin Li
908*a58d3d2aSXin Li      <t>This payload format and the Opus encoding do not exhibit any
909*a58d3d2aSXin Li      significant non-uniformity in the receiver-end computational load and thus
910*a58d3d2aSXin Li      are unlikely to pose a denial-of-service threat due to the receipt of
911*a58d3d2aSXin Li      pathological datagrams.</t>
912*a58d3d2aSXin Li    </section>
913*a58d3d2aSXin Li
914*a58d3d2aSXin Li    <section title='Acknowledgements'>
915*a58d3d2aSXin Li    <t>Many people have made useful comments and suggestions contributing to this document.
916*a58d3d2aSXin Li      In particular, we would like to thank
917*a58d3d2aSXin Li      Tina le Grand, Cullen Jennings, Jonathan Lennox, Gregory Maxwell, Colin Perkins, Jan Skoglund,
918*a58d3d2aSXin Li      Timothy B. Terriberry, Martin Thompson, Justin Uberti, Magnus Westerlund, and Mo Zanaty.</t>
919*a58d3d2aSXin Li    </section>
920*a58d3d2aSXin Li  </middle>
921*a58d3d2aSXin Li
922*a58d3d2aSXin Li  <back>
923*a58d3d2aSXin Li    <references title="Normative References">
924*a58d3d2aSXin Li      &rfc2119;
925*a58d3d2aSXin Li      &rfc3389;
926*a58d3d2aSXin Li      &rfc3550;
927*a58d3d2aSXin Li      &rfc3711;
928*a58d3d2aSXin Li      &rfc3551;
929*a58d3d2aSXin Li      &rfc6838;
930*a58d3d2aSXin Li      &rfc4855;
931*a58d3d2aSXin Li      &rfc4566;
932*a58d3d2aSXin Li      &rfc3264;
933*a58d3d2aSXin Li      &rfc2326;
934*a58d3d2aSXin Li      &rfc5576;
935*a58d3d2aSXin Li      &rfc6562;
936*a58d3d2aSXin Li      &rfc6716;
937*a58d3d2aSXin Li    </references>
938*a58d3d2aSXin Li
939*a58d3d2aSXin Li    <references title="Informative References">
940*a58d3d2aSXin Li      &rfc2974;
941*a58d3d2aSXin Li      &rfc4585;
942*a58d3d2aSXin Li      &rfc5124;
943*a58d3d2aSXin Li      &rfc5405;
944*a58d3d2aSXin Li      &rfc7202;
945*a58d3d2aSXin Li
946*a58d3d2aSXin Li      <reference anchor='rmcat' target='https://datatracker.ietf.org/wg/rmcat/documents/'>
947*a58d3d2aSXin Li        <front>
948*a58d3d2aSXin Li          <title>rmcat documents</title>
949*a58d3d2aSXin Li          <author/>
950*a58d3d2aSXin Li          <date/>
951*a58d3d2aSXin Li          <abstract>
952*a58d3d2aSXin Li            <t></t>
953*a58d3d2aSXin Li          </abstract></front>
954*a58d3d2aSXin Li      </reference>
955*a58d3d2aSXin Li
956*a58d3d2aSXin Li
957*a58d3d2aSXin Li    </references>
958*a58d3d2aSXin Li
959*a58d3d2aSXin Li  </back>
960*a58d3d2aSXin Li</rfc>
961