// xref: /aosp_15_r20/external/googleapis/google/genomics/v1/readalignment.proto (revision d5c09012810ac0c9f33fe448fb6da8260d444cc9)
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.genomics.v1;

// NOTE(review): no `google.api` annotation is referenced in the visible part of
// this file; confirm before treating this import as removable.
import "google/api/annotations.proto";
// Presumably declares `CigarUnit`, used by `LinearAlignment.cigar` -- confirm.
import "google/genomics/v1/cigar.proto";
// Presumably declares `Position`, used by `LinearAlignment.position` and
// `Read.next_mate_position` -- confirm.
import "google/genomics/v1/position.proto";
// Declares `google.protobuf.ListValue`, the value type of `Read.info`.
import "google/protobuf/struct.proto";

// Code-generation options. These only affect the generated language bindings,
// not the wire format.
option cc_enable_arenas = true;
option go_package = "google.golang.org/genproto/googleapis/genomics/v1;genomics";
option java_multiple_files = true;
option java_outer_classname = "ReadAlignmentProto";
option java_package = "com.google.genomics.v1";

// A linear alignment can be represented by one CIGAR string. Describes the
// mapped position and local alignment of the read to the reference.
message LinearAlignment {
  // The position of this alignment.
  Position position = 1;

  // The mapping quality of this alignment. Represents how likely
  // the read maps to this position as opposed to other locations.
  //
  // Specifically, this is -10 log10 Pr(mapping position is wrong), rounded to
  // the nearest integer.
  int32 mapping_quality = 2;

  // Represents the local alignment of this sequence (alignment matches, indels,
  // etc.) against the reference.
  repeated CigarUnit cigar = 3;
}

// A read alignment describes a linear alignment of a string of DNA to a
// [reference sequence][google.genomics.v1.Reference], in addition to metadata
// about the fragment (the molecule of DNA sequenced) and the read (the bases
// which were read by the sequencer). A read is equivalent to a line in a SAM
// file. A read belongs to exactly one read group and exactly one
// [read group set][google.genomics.v1.ReadGroupSet].
//
// For more genomics resource definitions, see [Fundamentals of Google
// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics).
//
// ### Reverse-stranded reads
//
// Mapped reads (reads having a non-null `alignment`) can be aligned to either
// the forward or the reverse strand of their associated reference. Strandedness
// of a mapped read is encoded by `alignment.position.reverseStrand`.
//
// If we consider the reference to be a forward-stranded coordinate space of
// `[0, reference.length)` with `0` as the left-most position and
// `reference.length` as the right-most position, reads are always aligned left
// to right. That is, `alignment.position.position` always refers to the
// left-most reference coordinate and `alignment.cigar` describes the alignment
// of this read to the reference from left to right. All per-base fields such as
// `alignedSequence` and `alignedQuality` share this same left-to-right
// orientation; this is true of reads which are aligned to either strand. For
// reverse-stranded reads, this means that `alignedSequence` is the reverse
// complement of the bases that were originally reported by the sequencing
// machine.
//
// ### Generating a reference-aligned sequence string
//
// When interacting with mapped reads, it's often useful to produce a string
// representing the local alignment of the read to reference. The following
// pseudocode demonstrates one way of doing this:
//
//     out = ""
//     offset = 0
//     for c in read.alignment.cigar {
//       switch c.operation {
//       case "ALIGNMENT_MATCH", "SEQUENCE_MATCH", "SEQUENCE_MISMATCH":
//         out += read.alignedSequence[offset:offset+c.operationLength]
//         offset += c.operationLength
//         break
//       case "CLIP_SOFT", "INSERT":
//         offset += c.operationLength
//         break
//       case "PAD":
//         out += repeat("*", c.operationLength)
//         break
//       case "DELETE":
//         out += repeat("-", c.operationLength)
//         break
//       case "SKIP":
//         out += repeat(" ", c.operationLength)
//         break
//       case "CLIP_HARD":
//         break
//       }
//     }
//     return out
//
// ### Converting to SAM's CIGAR string
//
// The following pseudocode generates a SAM CIGAR string from the
// `cigar` field. Note that this is a lossy conversion
// (`cigar.referenceSequence` is lost).
//
//     cigarMap = {
//       "ALIGNMENT_MATCH": "M",
//       "INSERT": "I",
//       "DELETE": "D",
//       "SKIP": "N",
//       "CLIP_SOFT": "S",
//       "CLIP_HARD": "H",
//       "PAD": "P",
//       "SEQUENCE_MATCH": "=",
//       "SEQUENCE_MISMATCH": "X",
//     }
//     cigarStr = ""
//     for c in read.alignment.cigar {
//       cigarStr += c.operationLength + cigarMap[c.operation]
//     }
//     return cigarStr
message Read {
  // The server-generated read ID, unique across all reads. This is different
  // from the `fragmentName`.
  string id = 1;

  // The ID of the read group this read belongs to. A read belongs to exactly
  // one read group. This is a server-generated ID which is distinct from SAM's
  // RG tag (for that value, see
  // [ReadGroup.name][google.genomics.v1.ReadGroup.name]).
  string read_group_id = 2;

  // The ID of the read group set this read belongs to. A read belongs to
  // exactly one read group set.
  string read_group_set_id = 3;

  // The fragment name. Equivalent to QNAME (query template name) in SAM.
  string fragment_name = 4;

  // The orientation and the distance between reads from the fragment are
  // consistent with the sequencing protocol (SAM flag 0x2).
  bool proper_placement = 5;

  // The fragment is a PCR or optical duplicate (SAM flag 0x400).
  bool duplicate_fragment = 6;

  // The observed length of the fragment, equivalent to TLEN in SAM.
  int32 fragment_length = 7;

  // The read number in sequencing. 0-based and less than `numberReads`. This
  // field replaces SAM flag 0x40 and 0x80.
  int32 read_number = 8;

  // The number of reads in the fragment (extension to SAM flag 0x1).
  int32 number_reads = 9;

  // Whether this read did not pass filters, such as platform or vendor quality
  // controls (SAM flag 0x200).
  bool failed_vendor_quality_checks = 10;

  // The linear alignment for this alignment record. This field is null for
  // unmapped reads.
  LinearAlignment alignment = 11;

  // Whether this alignment is secondary. Equivalent to SAM flag 0x100.
  // A secondary alignment represents an alternative to the primary alignment
  // for this read. Aligners may return secondary alignments if a read can map
  // ambiguously to multiple coordinates in the genome. By convention, each read
  // has one and only one alignment where both `secondaryAlignment`
  // and `supplementaryAlignment` are false.
  bool secondary_alignment = 12;

  // Whether this alignment is supplementary. Equivalent to SAM flag 0x800.
  // Supplementary alignments are used in the representation of a chimeric
  // alignment. In a chimeric alignment, a read is split into multiple
  // linear alignments that map to different reference contigs. The first
  // linear alignment in the read will be designated as the representative
  // alignment; the remaining linear alignments will be designated as
  // supplementary alignments. These alignments may have different mapping
  // quality scores. In each linear alignment in a chimeric alignment, the read
  // will be hard clipped. The `alignedSequence` and
  // `alignedQuality` fields in the alignment record will only
  // represent the bases for its respective linear alignment.
  bool supplementary_alignment = 13;

  // The bases of the read sequence contained in this alignment record,
  // **without CIGAR operations applied** (equivalent to SEQ in SAM).
  // `alignedSequence` and `alignedQuality` may be
  // shorter than the full read sequence and quality. This will occur if the
  // alignment is part of a chimeric alignment, or if the read was trimmed. When
  // this occurs, the CIGAR for this read will begin/end with a hard clip
  // operator that will indicate the length of the excised sequence.
  string aligned_sequence = 14;

  // The quality of the read sequence contained in this alignment record
  // (equivalent to QUAL in SAM).
  // `alignedSequence` and `alignedQuality` may be shorter than the full read
  // sequence and quality. This will occur if the alignment is part of a
  // chimeric alignment, or if the read was trimmed. When this occurs, the CIGAR
  // for this read will begin/end with a hard clip operator that will indicate
  // the length of the excised sequence.
  repeated int32 aligned_quality = 15;

  // The mapping of the primary alignment of the
  // `(readNumber+1)%numberReads` read in the fragment. It replaces
  // mate position and mate strand in SAM.
  Position next_mate_position = 16;

  // A map of additional read alignment information. This must be of the form
  // map<string, string[]> (string key mapping to a list of string values).
  map<string, google.protobuf.ListValue> info = 17;
}