/*
 * Copyright 2023 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef skgpu_graphite_compute_VelloComputeSteps_DEFINED
#define skgpu_graphite_compute_VelloComputeSteps_DEFINED

#include "include/core/SkColorType.h"
#include "include/core/SkSize.h"
#include "include/core/SkSpan.h"
#include "include/private/base/SkTArray.h"
#include "src/gpu/graphite/ComputeTypes.h"
#include "src/gpu/graphite/compute/ComputeStep.h"

#include "third_party/vello/cpp/vello.h"

#include <cstdint>
#include <cstring>
#include <string_view>
#include <tuple>

namespace skgpu::graphite {

// This file defines ComputeSteps for all Vello compute stages and their permutations. The
// declaration of each ComputeStep subclass mirrors the name of the pipeline stage as defined in
// the shader metadata.
//
// The compute stages all operate over a shared set of buffer and image resources. The
// `kVelloSlot_*` constant definitions below each uniquely identify a shared resource that must be
// instantiated when assembling the ComputeSteps into a DispatchGroup.
//
// === Monoids and Prefix Sums ===
//
// Vello's GPU algorithms make repeated use of parallel prefix sum techniques. These occur
// frequently in path rasterization (e.g. winding number accumulation across a scanline can be
// thought of as a per-pixel prefix sum) but Vello also uses them to calculate buffer offsets for
// associated entries across its variable-length encoding streams.
//
// For instance, given a scene that contains Bézier paths, each path gets encoded as a transform,
// a sequence of path tags (verbs), and zero or more 2-D points associated with each tag. N paths
// will often map to N transforms, N + M tags, and N + M + L points (where N > 0, M > 0, L >= 0).
// These entries are stored in separate, parallel transform, path tag, and path data streams, and
// the correspondence between entries of these independent streams is implicit. To keep CPU
// encoding of these streams fast, the offsets into each buffer for a given "path object" are
// computed dynamically and in parallel on the GPU. Since the offsets for each object build
// additively on the offsets of the objects that precede it in the stream, the parallel
// computation of offsets can be treated as a dynamic programming problem that maps well to
// parallel prefix sums, where each object is a "monoid" (https://en.wikipedia.org/wiki/Monoid)
// that supports algebraic addition/subtraction over data encoded in the path tags themselves.
//
// Once computed, a monoid contains the offsets into the input (and sometimes output) buffers for
// a given object. The parallel prefix sum operation is defined as a monoidal reduce + pre-scan
// pair (Prefix Sums and Their Applications, Blelloch, G.,
// https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf).
//
// While these concepts are an implementation detail, they are core to the Vello algorithm and
// are reflected in the pipeline names and data slot definitions.
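//
// As a minimal illustrative sketch (the type and function names below are hypothetical and
// appear in neither this header nor Vello's shaders), a monoid carrying stream offsets and a
// serial exclusive prefix scan over it could look like:
//
//   struct TagMonoid {
//       uint32_t fTransformCount = 0;  // transforms consumed by the tags seen so far
//       uint32_t fPointCount = 0;      // points consumed by the tags seen so far
//
//       // The associative combine operation that makes this type a monoid.
//       TagMonoid operator+(const TagMonoid& o) const {
//           return {fTransformCount + o.fTransformCount, fPointCount + o.fPointCount};
//       }
//   };
//
//   // Exclusive prefix scan: out[i] receives the combined monoids of all objects before i,
//   // i.e. object i's offsets into the transform and point streams. The GPU computes the same
//   // result in parallel as a reduce + scan dispatch pair.
//   void exclusiveScan(const TagMonoid* in, TagMonoid* out, size_t n) {
//       TagMonoid sum;
//       for (size_t i = 0; i < n; ++i) {
//           out[i] = sum;
//           sum = sum + in[i];
//       }
//   }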
//
// === Full Pipeline ===
//
// The full Vello pipeline stages are as follows and should be dispatched in the following order:
//
// I. Build the path monoid stream:
//    If the input fits within the workgroup size:
//        pathtag_reduce, pathtag_scan_small
//    else:
//        pathtag_reduce, pathtag_reduce2, pathtag_scan1, pathtag_scan_large
//
// II. Compute path bounding boxes, flatten path elements into an unordered line soup:
//     bbox_clear, flatten
//
// III. Process the draw object stream to build the draw monoids and inputs to the clip stage:
//      draw_reduce, draw_leaf
//
// IV. Compute the bounding boxes for the clip stack from the input stream, if the scene contains
//     clips:
//     clip_reduce, clip_leaf
//
// V. Allocate tile and segment buffers for the individual bins and prepare for coarse
//    rasterization:
//    binning, tile_alloc, path_count_setup, path_count
//
// VI. Coarse rasterization:
//     backdrop_dyn, coarse
//
// VII. Per-tile segment assembly and fine rasterization:
//      path_tiling_setup, path_tiling, fine
//
// TODO: Document the coverage mask pipeline once it has been re-implemented.

// ***
// Shared buffers that are accessed by various stages.
//
// The render configuration uniform buffer.
constexpr int kVelloSlot_ConfigUniform = 0;

// The scene encoding buffer.
constexpr int kVelloSlot_Scene = 1;

// ***
// Buffers used during the element processing stage. This stage converts the stream of
// variable-length path tags, transforms, and brushes into a "path monoid" stream containing
// buffer offsets for the subsequent stages that associate the input streams with individual draw
// elements. This stage performs a parallel prefix sum (reduce + scan), which can be performed in
// two dispatches if the entire input can be processed by a single workgroup per dispatch.
// Otherwise, the algorithm requires two additional dispatches to continue the traversal (this is
// due to a lack of primitives to synchronize execution across workgroups in MSL and WGSL).
//
// Single-pass variant pipelines: pathtag_reduce, pathtag_scan_small
// Multi-pass variant pipelines: pathtag_reduce, pathtag_reduce2, pathtag_scan1,
// pathtag_scan_large
constexpr int kVelloSlot_TagMonoid = 2;

// Single-pass variant slots:
constexpr int kVelloSlot_PathtagReduceOutput = 3;

// Multi-pass variant slots:
constexpr int kVelloSlot_LargePathtagReduceFirstPassOutput = kVelloSlot_PathtagReduceOutput;
constexpr int kVelloSlot_LargePathtagReduceSecondPassOutput = 4;
constexpr int kVelloSlot_LargePathtagScanFirstPassOutput = 5;
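
// As an illustrative sketch of the variant selection (the constant and helper below are
// hypothetical; the actual workgroup size comes from VelloStageLocalSize() and the decision is
// made by the code that assembles the DispatchGroup):
//
//   constexpr size_t kWorkgroupSize = 256;
//
//   // Each pathtag_reduce workgroup collapses one workgroup's worth of path tags into a single
//   // partial monoid, so the single-pass pathtag_scan_small variant suffices iff the partials
//   // themselves fit within one workgroup.
//   bool FitsSinglePassVariant(size_t numPathTags) {
//       size_t numPartials = (numPathTags + kWorkgroupSize - 1) / kWorkgroupSize;
//       return numPartials <= kWorkgroupSize;
//   }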

// ***
// The second part of element processing flattens path elements (moveTo, lineTo, quadTo, etc.)
// into an unordered "line soup" buffer and computes their bounding boxes. This stage is where
// strokes get expanded to fills and stroke styles get applied. The output is the unordered line
// soup buffer and the tight device-space bounding box of each path.
//
// Pipelines: bbox_clear, flatten
constexpr int kVelloSlot_PathBBoxes = 6;
constexpr int kVelloSlot_Lines = 7;

// ***
// The next part prepares the draw object stream (entries in the per-tile command list, aka PTCL)
// and additional metadata for the subsequent clipping and binning stages.
//
// Pipelines: draw_reduce, draw_leaf
constexpr int kVelloSlot_DrawReduceOutput = 8;
constexpr int kVelloSlot_DrawMonoid = 9;
constexpr int kVelloSlot_InfoBinData = 10;
constexpr int kVelloSlot_ClipInput = 11;

// ***
// Clipping. The outputs of this stage are the finalized draw monoid and the clip bounding boxes.
// Clipping involves evaluating the stack monoid; refer to the following references for the
// meaning of these buffers: https://arxiv.org/pdf/2205.11659.pdf,
// https://en.wikipedia.org/wiki/Bicyclic_semigroup
//
// Pipelines: clip_reduce, clip_leaf
constexpr int kVelloSlot_ClipBicyclic = 12;
constexpr int kVelloSlot_ClipElement = 13;
constexpr int kVelloSlot_ClipBBoxes = 14;

// ***
// Buffers containing bump-allocated data: the inputs and outputs to the binning, coarse raster,
// and per-tile segment assembly stages.
//
// Pipelines: binning, tile_alloc, path_count, backdrop, coarse, path_tiling
constexpr int kVelloSlot_DrawBBoxes = 15;
constexpr int kVelloSlot_BumpAlloc = 16;
constexpr int kVelloSlot_BinHeader = 17;

constexpr int kVelloSlot_Path = 18;
constexpr int kVelloSlot_Tile = 19;
constexpr int kVelloSlot_SegmentCounts = 20;
constexpr int kVelloSlot_Segments = 21;
constexpr int kVelloSlot_PTCL = 22;

// ***
// Texture resources used by the fine rasterization stage. The gradient image needs to get
// populated on the CPU with pre-computed gradient ramps. The image atlas is intended to hold
// pre-uploaded images that are composited into the scene.
//
// The output image contains the final render.
constexpr int kVelloSlot_OutputImage = 23;
constexpr int kVelloSlot_GradientImage = 24;
constexpr int kVelloSlot_ImageAtlas = 25;

// ***
// The indirect count buffer is used to issue an indirect dispatch of the path count and path
// tiling stages.
constexpr int kVelloSlot_IndirectCount = 26;

// ***
// The sample mask lookup table used in MSAA modes of the fine rasterization stage.
constexpr int kVelloSlot_MaskLUT = 27;

std::string_view VelloStageName(vello_cpp::ShaderStage);
WorkgroupSize VelloStageLocalSize(vello_cpp::ShaderStage);
skia_private::TArray<ComputeStep::WorkgroupBufferDesc> VelloWorkgroupBuffers(
        vello_cpp::ShaderStage);
ComputeStep::NativeShaderSource VelloNativeShaderSource(vello_cpp::ShaderStage,
                                                        ComputeStep::NativeShaderFormat);
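
// To illustrate how the slots above tie independent stages together (a hypothetical sketch; the
// real resource lists live with the out-of-line constructor definitions, and the exact
// ResourceDesc shape is declared in ComputeStep.h): any two steps that declare a shared resource
// with the same slot index are bound to the same allocation when the ComputeSteps are assembled
// into a DispatchGroup. Schematically, for a scan stage that reads the reduce output and writes
// the tag monoid stream:
//
//   static constexpr ComputeStep::ResourceDesc kScanSmallResources[] = {
//       // type,                                    data-flow,                      slot
//       {ComputeStep::ResourceType::kUniformBuffer, ComputeStep::DataFlow::kShared,
//        kVelloSlot_ConfigUniform},
//       {ComputeStep::ResourceType::kStorageBuffer, ComputeStep::DataFlow::kShared,
//        kVelloSlot_Scene},
//       {ComputeStep::ResourceType::kStorageBuffer, ComputeStep::DataFlow::kShared,
//        kVelloSlot_PathtagReduceOutput},
//       {ComputeStep::ResourceType::kStorageBuffer, ComputeStep::DataFlow::kShared,
//        kVelloSlot_TagMonoid},
//   };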

template <vello_cpp::ShaderStage S>
class VelloStep : public ComputeStep {
public:
    ~VelloStep() override = default;

    NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
        return VelloNativeShaderSource(S, format);
    }

protected:
    explicit VelloStep(SkSpan<const ResourceDesc> resources)
            : ComputeStep(VelloStageName(S),
                          VelloStageLocalSize(S),
                          resources,
                          AsSpan<ComputeStep::WorkgroupBufferDesc>(VelloWorkgroupBuffers(S)),
                          Flags::kSupportsNativeShader) {}

private:
    // Helper that creates a SkSpan from a universal reference to a container. Generally,
    // creating a SkSpan from an rvalue reference is not safe since the pointer stored in the
    // SkSpan will dangle beyond the constructor expression. In our usage in the constructor
    // above, the lifetime of the temporary TArray should match that of the SkSpan, both of which
    // should live through the constructor call expression.
    //
    // From https://en.cppreference.com/w/cpp/language/reference_initialization#Lifetime_of_a_temporary:
    //
    //   a temporary bound to a reference parameter in a function call exists until the end of
    //   the full expression containing that function call
    //
    template <typename T, typename C>
    static SkSpan<const T> AsSpan(C&& container) {
        return SkSpan(std::data(container), std::size(container));
    }
};

#define VELLO_COMPUTE_STEP(stage)                                                       \
    class Vello##stage##Step final : public VelloStep<vello_cpp::ShaderStage::stage> { \
    public:                                                                             \
        Vello##stage##Step();                                                           \
    };

VELLO_COMPUTE_STEP(BackdropDyn);
VELLO_COMPUTE_STEP(BboxClear);
VELLO_COMPUTE_STEP(Binning);
VELLO_COMPUTE_STEP(ClipLeaf);
VELLO_COMPUTE_STEP(ClipReduce);
VELLO_COMPUTE_STEP(Coarse);
VELLO_COMPUTE_STEP(Flatten);
VELLO_COMPUTE_STEP(DrawLeaf);
VELLO_COMPUTE_STEP(DrawReduce);
VELLO_COMPUTE_STEP(PathCount);
VELLO_COMPUTE_STEP(PathCountSetup);
VELLO_COMPUTE_STEP(PathTiling);
VELLO_COMPUTE_STEP(PathTilingSetup);
VELLO_COMPUTE_STEP(PathtagReduce);
VELLO_COMPUTE_STEP(PathtagReduce2);
VELLO_COMPUTE_STEP(PathtagScan1);
VELLO_COMPUTE_STEP(PathtagScanLarge);
VELLO_COMPUTE_STEP(PathtagScanSmall);
VELLO_COMPUTE_STEP(TileAlloc);

#undef VELLO_COMPUTE_STEP
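
// For reference, each VELLO_COMPUTE_STEP invocation above expands to a plain subclass
// declaration; e.g. VELLO_COMPUTE_STEP(Coarse) declares:
//
//   class VelloCoarseStep final : public VelloStep<vello_cpp::ShaderStage::Coarse> {
//   public:
//       VelloCoarseStep();
//   };
//
// The constructors are defined out-of-line, where each step supplies its ResourceDesc list to
// the VelloStep base constructor.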

template <vello_cpp::ShaderStage S, SkColorType T>
class VelloFineStepBase : public VelloStep<S> {
public:
    // We need to return a texture format for the bound textures.
    std::tuple<SkISize, SkColorType> calculateTextureParameters(
            int index, const ComputeStep::ResourceDesc&) const override {
        SkASSERT(index == 4);
        // TODO: The texture dimensions are unknown here, so this method returns 0 for the
        // texture size. In this case the size field is unused, since VelloRenderer assigns
        // texture resources directly to the DispatchGroupBuilder. The format must still be
        // queried to describe the ComputeStep's binding layout. This method could be improved
        // to enable conditional querying of optional/dynamic parameters.
        return {{}, T};
    }

protected:
    explicit VelloFineStepBase(SkSpan<const ComputeStep::ResourceDesc> resources)
            : VelloStep<S>(resources) {}
};

template <vello_cpp::ShaderStage S, SkColorType T, ::rust::Vec<uint8_t> (*MaskLutBuilder)()>
class VelloFineMsaaStepBase : public VelloFineStepBase<S, T> {
public:
    size_t calculateBufferSize(int resourceIndex,
                               const ComputeStep::ResourceDesc&) const override {
        SkASSERT(resourceIndex == 5);
        return fMaskLut.size();
    }

    void prepareStorageBuffer(int resourceIndex,
                              const ComputeStep::ResourceDesc&,
                              void* buffer,
                              size_t bufferSize) const override {
        SkASSERT(resourceIndex == 5);
        SkASSERT(fMaskLut.size() == bufferSize);
        memcpy(buffer, fMaskLut.data(), fMaskLut.size());
    }

protected:
    explicit VelloFineMsaaStepBase(SkSpan<const ComputeStep::ResourceDesc> resources)
            : VelloFineStepBase<S, T>(resources), fMaskLut(MaskLutBuilder()) {}

private:
    ::rust::Vec<uint8_t> fMaskLut;
};

class VelloFineAreaStep final
        : public VelloFineStepBase<vello_cpp::ShaderStage::FineArea, kRGBA_8888_SkColorType> {
public:
    VelloFineAreaStep();
};

class VelloFineAreaAlpha8Step final
        : public VelloFineStepBase<vello_cpp::ShaderStage::FineAreaR8, kAlpha_8_SkColorType> {
public:
    VelloFineAreaAlpha8Step();
};

class VelloFineMsaa16Step final
        : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa16,
                                       kRGBA_8888_SkColorType,
                                       vello_cpp::build_mask_lut_16> {
public:
    VelloFineMsaa16Step();
};

class VelloFineMsaa16Alpha8Step final
        : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa16R8,
                                       kAlpha_8_SkColorType,
                                       vello_cpp::build_mask_lut_16> {
public:
    VelloFineMsaa16Alpha8Step();
};

class VelloFineMsaa8Step final
        : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa8,
                                       kRGBA_8888_SkColorType,
                                       vello_cpp::build_mask_lut_8> {
public:
    VelloFineMsaa8Step();
};

class VelloFineMsaa8Alpha8Step final
        : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa8R8,
                                       kAlpha_8_SkColorType,
                                       vello_cpp::build_mask_lut_8> {
public:
    VelloFineMsaa8Alpha8Step();
};

} // namespace skgpu::graphite

#endif // skgpu_graphite_compute_VelloComputeSteps_DEFINED