// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.genomics.v1;

import "google/api/annotations.proto";
import "google/genomics/v1/cigar.proto";
import "google/genomics/v1/position.proto";
import "google/protobuf/struct.proto";

option cc_enable_arenas = true;
option go_package = "google.golang.org/genproto/googleapis/genomics/v1;genomics";
option java_multiple_files = true;
option java_outer_classname = "ReadAlignmentProto";
option java_package = "com.google.genomics.v1";

// A linear alignment can be represented by one CIGAR string. Describes the
// mapped position and local alignment of the read to the reference.
message LinearAlignment {
  // The position of this alignment.
  Position position = 1;

  // The mapping quality of this alignment. Represents how likely
  // the read maps to this position as opposed to other locations.
  //
  // Specifically, this is -10 log10 Pr(mapping position is wrong), rounded to
  // the nearest integer.
  int32 mapping_quality = 2;

  // Represents the local alignment of this sequence (alignment matches, indels,
  // etc) against the reference.
  repeated CigarUnit cigar = 3;
}

// A read alignment describes a linear alignment of a string of DNA to a
// [reference sequence][google.genomics.v1.Reference], in addition to metadata
// about the fragment (the molecule of DNA sequenced) and the read (the bases
// which were read by the sequencer). A read is equivalent to a line in a SAM
// file. A read belongs to exactly one read group and exactly one
// [read group set][google.genomics.v1.ReadGroupSet].
//
// For more genomics resource definitions, see [Fundamentals of Google
// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics).
//
// ### Reverse-stranded reads
//
// Mapped reads (reads having a non-null `alignment`) can be aligned to either
// the forward or the reverse strand of their associated reference. Strandedness
// of a mapped read is encoded by `alignment.position.reverseStrand`.
//
// If we consider the reference to be a forward-stranded coordinate space of
// `[0, reference.length)` with `0` as the left-most position and
// `reference.length` as the right-most position, reads are always aligned left
// to right. That is, `alignment.position.position` always refers to the
// left-most reference coordinate and `alignment.cigar` describes the alignment
// of this read to the reference from left to right. All per-base fields such as
// `alignedSequence` and `alignedQuality` share this same left-to-right
// orientation; this is true of reads which are aligned to either strand. For
// reverse-stranded reads, this means that `alignedSequence` is the reverse
// complement of the bases that were originally reported by the sequencing
// machine.
//
// ### Generating a reference-aligned sequence string
//
// When interacting with mapped reads, it's often useful to produce a string
// representing the local alignment of the read to the reference. The following
// pseudocode demonstrates one way of doing this:
//
//     out = ""
//     offset = 0
//     for c in read.alignment.cigar {
//       switch c.operation {
//       case "ALIGNMENT_MATCH", "SEQUENCE_MATCH", "SEQUENCE_MISMATCH":
//         out += read.alignedSequence[offset:offset+c.operationLength]
//         offset += c.operationLength
//         break
//       case "CLIP_SOFT", "INSERT":
//         offset += c.operationLength
//         break
//       case "PAD":
//         out += repeat("*", c.operationLength)
//         break
//       case "DELETE":
//         out += repeat("-", c.operationLength)
//         break
//       case "SKIP":
//         out += repeat(" ", c.operationLength)
//         break
//       case "CLIP_HARD":
//         break
//       }
//     }
//     return out
//
// ### Converting to SAM's CIGAR string
//
// The following pseudocode generates a SAM CIGAR string from the
// `cigar` field. Note that this is a lossy conversion
// (`cigar.referenceSequence` is lost).
//
//     cigarMap = {
//       "ALIGNMENT_MATCH": "M",
//       "INSERT": "I",
//       "DELETE": "D",
//       "SKIP": "N",
//       "CLIP_SOFT": "S",
//       "CLIP_HARD": "H",
//       "PAD": "P",
//       "SEQUENCE_MATCH": "=",
//       "SEQUENCE_MISMATCH": "X",
//     }
//     cigarStr = ""
//     for c in read.alignment.cigar {
//       cigarStr += c.operationLength + cigarMap[c.operation]
//     }
//     return cigarStr
message Read {
  // The server-generated read ID, unique across all reads. This is different
  // from the `fragmentName`.
  string id = 1;

  // The ID of the read group this read belongs to. A read belongs to exactly
  // one read group. This is a server-generated ID which is distinct from SAM's
  // RG tag (for that value, see
  // [ReadGroup.name][google.genomics.v1.ReadGroup.name]).
  string read_group_id = 2;

  // The ID of the read group set this read belongs to. A read belongs to
  // exactly one read group set.
  string read_group_set_id = 3;

  // The fragment name. Equivalent to QNAME (query template name) in SAM.
  string fragment_name = 4;

  // The orientation and the distance between reads from the fragment are
  // consistent with the sequencing protocol (SAM flag 0x2).
  bool proper_placement = 5;

  // The fragment is a PCR or optical duplicate (SAM flag 0x400).
  bool duplicate_fragment = 6;

  // The observed length of the fragment, equivalent to TLEN in SAM.
  int32 fragment_length = 7;

  // The read number in sequencing. 0-based and less than numberReads. This
  // field replaces SAM flag 0x40 and 0x80.
  int32 read_number = 8;

  // The number of reads in the fragment (extension to SAM flag 0x1).
  int32 number_reads = 9;

  // Whether this read did not pass filters, such as platform or vendor quality
  // controls (SAM flag 0x200).
  bool failed_vendor_quality_checks = 10;

  // The linear alignment for this alignment record. This field is null for
  // unmapped reads.
  LinearAlignment alignment = 11;

  // Whether this alignment is secondary. Equivalent to SAM flag 0x100.
  // A secondary alignment represents an alternative to the primary alignment
  // for this read. Aligners may return secondary alignments if a read can map
  // ambiguously to multiple coordinates in the genome. By convention, each read
  // has one and only one alignment where both `secondaryAlignment`
  // and `supplementaryAlignment` are false.
  bool secondary_alignment = 12;

  // Whether this alignment is supplementary. Equivalent to SAM flag 0x800.
  // Supplementary alignments are used in the representation of a chimeric
  // alignment. In a chimeric alignment, a read is split into multiple
  // linear alignments that map to different reference contigs. The first
  // linear alignment in the read will be designated as the representative
  // alignment; the remaining linear alignments will be designated as
  // supplementary alignments. These alignments may have different mapping
  // quality scores. In each linear alignment in a chimeric alignment, the read
  // will be hard clipped. The `alignedSequence` and
  // `alignedQuality` fields in the alignment record will only
  // represent the bases for its respective linear alignment.
  bool supplementary_alignment = 13;

  // The bases of the read sequence contained in this alignment record,
  // **without CIGAR operations applied** (equivalent to SEQ in SAM).
  // `alignedSequence` and `alignedQuality` may be
  // shorter than the full read sequence and quality. This will occur if the
  // alignment is part of a chimeric alignment, or if the read was trimmed. When
  // this occurs, the CIGAR for this read will begin/end with a hard clip
  // operator that will indicate the length of the excised sequence.
  string aligned_sequence = 14;

  // The quality of the read sequence contained in this alignment record
  // (equivalent to QUAL in SAM).
  // `alignedSequence` and `alignedQuality` may be shorter than the full read
  // sequence and quality. This will occur if the alignment is part of a
  // chimeric alignment, or if the read was trimmed. When this occurs, the CIGAR
  // for this read will begin/end with a hard clip operator that will indicate
  // the length of the excised sequence.
  repeated int32 aligned_quality = 15;

  // The mapping of the primary alignment of the
  // `(readNumber+1)%numberReads` read in the fragment. It replaces
  // mate position and mate strand in SAM.
  Position next_mate_position = 16;

  // A map of additional read alignment information. This must be of the form
  // map<string, string[]> (string key mapping to a list of string values).
  map<string, google.protobuf.ListValue> info = 17;
}