/*
 * Copyright 2023 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "src/gpu/graphite/compute/VelloRenderer.h"

#include "include/core/SkPath.h"
#include "include/core/SkTypes.h"
#include "include/gpu/graphite/Recorder.h"
#include "src/core/SkGeometry.h"
#include "src/core/SkPathPriv.h"
#include "src/core/SkTraceEvent.h"
#include "src/gpu/graphite/BufferManager.h"
#include "src/gpu/graphite/Caps.h"
#include "src/gpu/graphite/DrawParams.h"
#include "src/gpu/graphite/Log.h"
#include "src/gpu/graphite/PipelineData.h"
#include "src/gpu/graphite/RecorderPriv.h"
#include "src/gpu/graphite/TextureProxy.h"
#include "src/gpu/graphite/TextureUtils.h"
#include "src/gpu/graphite/UniformManager.h"
#include "src/gpu/graphite/compute/DispatchGroup.h"

#include <algorithm>

namespace skgpu::graphite {
namespace {

BindBufferInfo new_scratch_slice(ScratchBuffer& scratch) {
    size_t size = scratch.size();  // Use the whole buffer.
    return scratch.suballocate(size);
}

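// Allocates a cleared (zero-initialized) slice from the buffer manager's indirect storage pool,
// used below for the indirect dispatch count buffer.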
BindBufferInfo new_indirect_slice(DrawBufferManager* mgr, size_t size) {
    return mgr->getIndirectStorage(size, ClearBuffer::kYes);
}

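// Wraps a raw pointer and byte count as a rust::Slice<uint8_t> for passing across the vello_cpp
// FFI boundary.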
::rust::Slice<uint8_t> to_slice(void* ptr, size_t size) {
    return {static_cast<uint8_t*>(ptr), size};
}

vello_cpp::Affine to_vello_affine(const SkMatrix& m) {
    // Vello currently doesn't support perspective scaling and the encoding only accepts a 2x3
    // affine transform matrix.
    return {{m.get(0), m.get(3), m.get(1), m.get(4), m.get(2), m.get(5)}};
}

vello_cpp::Point to_vello_point(const SkPoint& p) { return {p.x(), p.y()}; }

vello_cpp::Color to_vello_color(const SkColor4f& color) {
    SkColor c = color.toSkColor();
    return {
            static_cast<uint8_t>(SkColorGetR(c)),
            static_cast<uint8_t>(SkColorGetG(c)),
            static_cast<uint8_t>(SkColorGetB(c)),
            static_cast<uint8_t>(SkColorGetA(c)),
    };
}

WorkgroupSize to_wg_size(const vello_cpp::WorkgroupSize& src) {
    return WorkgroupSize(src.x, src.y, src.z);
}

vello_cpp::Fill to_fill_type(SkPathFillType fillType) {
    // Vello does not provide an encoding for inverse fill types. When Skia uses vello to render
    // a coverage mask for an inverse fill, it encodes a regular fill and inverts the coverage
    // value after sampling the mask.
    switch (fillType) {
        case SkPathFillType::kWinding:
        case SkPathFillType::kInverseWinding:
            return vello_cpp::Fill::NonZero;
        case SkPathFillType::kEvenOdd:
        case SkPathFillType::kInverseEvenOdd:
            return vello_cpp::Fill::EvenOdd;
    }
    return vello_cpp::Fill::NonZero;
}

vello_cpp::CapStyle to_cap_style(SkPaint::Cap cap) {
    switch (cap) {
        case SkPaint::Cap::kButt_Cap:
            return vello_cpp::CapStyle::Butt;
        case SkPaint::Cap::kRound_Cap:
            return vello_cpp::CapStyle::Round;
        case SkPaint::Cap::kSquare_Cap:
            return vello_cpp::CapStyle::Square;
    }
    SkUNREACHABLE;
}

vello_cpp::JoinStyle to_join_style(SkPaint::Join join) {
    switch (join) {
        case SkPaint::Join::kMiter_Join:
            return vello_cpp::JoinStyle::Miter;
        case SkPaint::Join::kBevel_Join:
            return vello_cpp::JoinStyle::Bevel;
        case SkPaint::Join::kRound_Join:
            return vello_cpp::JoinStyle::Round;
    }
    SkUNREACHABLE;
}

vello_cpp::Stroke to_stroke(const SkStrokeRec& style) {
    return vello_cpp::Stroke{
            /*width=*/style.getWidth(),
            /*miter_limit=*/style.getMiter(),
            /*cap=*/to_cap_style(style.getCap()),
            /*join=*/to_join_style(style.getJoin()),
    };
}

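// Adapts SkPath iteration to the vello_cpp::PathIterator interface. Verbs are translated one
// element at a time; conic segments, which the Vello encoding does not support, are expanded into
// quadratic Bézier approximations as they are encountered.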
class PathIter : public vello_cpp::PathIterator {
public:
    PathIter(const SkPath& path, const Transform& t)
            : fIterate(path), fIter(fIterate.begin()), fTransform(t) {}

    bool next_element(vello_cpp::PathElement* outElem) override {
        if (fConicQuadIdx < fConicConverter.countQuads()) {
            SkASSERT(fConicQuads != nullptr);
            outElem->verb = vello_cpp::PathVerb::QuadTo;
            int pointIdx = fConicQuadIdx * 2;
            outElem->points[0] = to_vello_point(fConicQuads[pointIdx]);
            outElem->points[1] = to_vello_point(fConicQuads[pointIdx + 1]);
            outElem->points[2] = to_vello_point(fConicQuads[pointIdx + 2]);
            fConicQuadIdx++;
            return true;
        }

        if (fIter == fIterate.end()) {
            return false;
        }

        SkASSERT(outElem);
        auto [verb, points, weights] = *fIter;
        fIter++;

        switch (verb) {
            case SkPathVerb::kMove:
                outElem->verb = vello_cpp::PathVerb::MoveTo;
                outElem->points[0] = to_vello_point(points[0]);
                break;
            case SkPathVerb::kLine:
                outElem->verb = vello_cpp::PathVerb::LineTo;
                outElem->points[0] = to_vello_point(points[0]);
                outElem->points[1] = to_vello_point(points[1]);
                break;
            case SkPathVerb::kConic:
                // The vello encoding API doesn't handle conic sections. Approximate it with
                // quadratic Béziers. No other conic->quad conversions should be in progress.
                SkASSERT(fConicQuadIdx >= fConicConverter.countQuads());
                fConicQuads = fConicConverter.computeQuads(
                        points, *weights, 0.25 / fTransform.maxScaleFactor());
                outElem->verb = vello_cpp::PathVerb::QuadTo;
                outElem->points[0] = to_vello_point(fConicQuads[0]);
                outElem->points[1] = to_vello_point(fConicQuads[1]);
                outElem->points[2] = to_vello_point(fConicQuads[2]);

                // The next call to `next_element` will yield the next quad in the list (at
                // index 1) if `fConicConverter` contains more than 1 quad.
                fConicQuadIdx = 1;
                break;
            case SkPathVerb::kQuad:
                outElem->verb = vello_cpp::PathVerb::QuadTo;
                outElem->points[0] = to_vello_point(points[0]);
                outElem->points[1] = to_vello_point(points[1]);
                outElem->points[2] = to_vello_point(points[2]);
                break;
            case SkPathVerb::kCubic:
                outElem->verb = vello_cpp::PathVerb::CurveTo;
                outElem->points[0] = to_vello_point(points[0]);
                outElem->points[1] = to_vello_point(points[1]);
                outElem->points[2] = to_vello_point(points[2]);
                outElem->points[3] = to_vello_point(points[3]);
                break;
            case SkPathVerb::kClose:
                outElem->verb = vello_cpp::PathVerb::Close;
                break;
        }

        return true;
    }

private:
    SkPathPriv::Iterate fIterate;
    SkPathPriv::RangeIter fIter;

    // Variables used to track conic to quadratic spline conversion. `fTransform` is used to
    // determine the subpixel error tolerance in device coordinate space.
    const Transform& fTransform;
    SkAutoConicToQuads fConicConverter;
    const SkPoint* fConicQuads = nullptr;
    int fConicQuadIdx = 0;
};

}  // namespace

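// VelloScene wraps a vello_cpp encoding; the methods below translate Skia geometry, fill, stroke,
// and clip state into Vello encoding commands.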
VelloScene::VelloScene() : fEncoding(vello_cpp::new_encoding()) {}

void VelloScene::reset() {
    fEncoding->reset();
}

void VelloScene::solidFill(const SkPath& shape,
                           const SkColor4f& fillColor,
                           const SkPathFillType fillType,
                           const Transform& t) {
    PathIter iter(shape, t);
    fEncoding->fill(to_fill_type(fillType),
                    to_vello_affine(t),
                    {vello_cpp::BrushKind::Solid, {to_vello_color(fillColor)}},
                    iter);
}

void VelloScene::solidStroke(const SkPath& shape,
                             const SkColor4f& fillColor,
                             const SkStrokeRec& style,
                             const Transform& t) {
    // TODO: Obtain dashing pattern here and let Vello handle dashing on the CPU while
    // encoding the path?
    PathIter iter(shape, t);
    vello_cpp::Brush brush{vello_cpp::BrushKind::Solid, {to_vello_color(fillColor)}};
    fEncoding->stroke(to_stroke(style), to_vello_affine(t), brush, iter);
}

void VelloScene::pushClipLayer(const SkPath& shape, const Transform& t) {
    PathIter iter(shape, t);
    fEncoding->begin_clip(to_vello_affine(t), iter);
    SkDEBUGCODE(fLayers++;)
}

void VelloScene::popClipLayer() {
    SkASSERT(fLayers > 0);
    fEncoding->end_clip();
    SkDEBUGCODE(fLayers--;)
}

void VelloScene::append(const VelloScene& other) {
    fEncoding->append(*other.fEncoding);
}

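// Pick the fine rasterization step variants that match the coverage mask target format reported
// by the caps: the Alpha8 variants when a single-channel target is supported, the default
// variants otherwise.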
VelloRenderer::VelloRenderer(const Caps* caps) {
    if (ComputeShaderCoverageMaskTargetFormat(caps) == kAlpha_8_SkColorType) {
        fFineArea = std::make_unique<VelloFineAreaAlpha8Step>();
        fFineMsaa16 = std::make_unique<VelloFineMsaa16Alpha8Step>();
        fFineMsaa8 = std::make_unique<VelloFineMsaa8Alpha8Step>();
    } else {
        fFineArea = std::make_unique<VelloFineAreaStep>();
        fFineMsaa16 = std::make_unique<VelloFineMsaa16Step>();
        fFineMsaa8 = std::make_unique<VelloFineMsaa8Step>();
    }
}

VelloRenderer::~VelloRenderer() = default;

std::unique_ptr<DispatchGroup> VelloRenderer::renderScene(const RenderParams& params,
                                                          const VelloScene& scene,
                                                          sk_sp<TextureProxy> target,
                                                          Recorder* recorder) const {
    TRACE_EVENT0("skia.gpu", TRACE_FUNC);
    SkASSERT(target);

    if (scene.fEncoding->is_empty()) {
        return nullptr;
    }

    if (params.fWidth == 0 || params.fHeight == 0) {
        return nullptr;
    }

    // TODO: validate that the pixel format matches the pipeline layout.
    // Clamp the draw region to the target texture dimensions.
    const SkISize dims = target->dimensions();
    if (dims.isEmpty() || dims.fWidth < 0 || dims.fHeight < 0) {
        SKGPU_LOG_W("VelloRenderer: cannot render to an empty target");
        return nullptr;
    }

    SkASSERT(scene.fLayers == 0);  // Begin/end clips must be matched.
    auto config = scene.fEncoding->prepare_render(
            std::min(params.fWidth, static_cast<uint32_t>(dims.fWidth)),
            std::min(params.fHeight, static_cast<uint32_t>(dims.fHeight)),
            to_vello_color(params.fBaseColor));
    auto dispatchInfo = config->workgroup_counts();
    auto bufferSizes = config->buffer_sizes();

    DispatchGroup::Builder builder(recorder);

    // In total there are 25 resources that are used across the full pipeline stages. The sizes of
    // these resources depend on the encoded scene. We allocate all of them and assign them
    // directly to the builder here instead of delegating the logic to the ComputeSteps.
    DrawBufferManager* bufMgr = recorder->priv().drawBufferManager();

    size_t uboSize = config->config_uniform_buffer_size();
    auto [uboPtr, configBuf] = bufMgr->getUniformPointer(uboSize);
    if (!uboPtr || !config->write_config_uniform_buffer(to_slice(uboPtr, uboSize))) {
        return nullptr;
    }

    size_t sceneSize = config->scene_buffer_size();
    auto [scenePtr, sceneBuf] = bufMgr->getStoragePointer(sceneSize);
    if (!scenePtr || !config->write_scene_buffer(to_slice(scenePtr, sceneSize))) {
        return nullptr;
    }

    // TODO(b/285189802): The default sizes for the bump buffers (~97MB) exceed Graphite's
    // resource budget if multiple passes are necessary per frame (250MB, see ResourceCache.h).
    // We apply a crude size reduction here which seems to be enough for a 4k x 4k atlas render
    // for the GMs that we have tested. The numbers below are able to render GM_longpathdash with
    // CPU-side stroke expansion.
    //
    // We need to come up with a better approach to accurately predict the sizes for these buffers
    // based on the scene encoding and our resource budget. It should be possible to build a
    // conservative estimate using the total number of path verbs, some heuristic based on the
    // verb and the path's transform, and the total number of tiles.
    //
    // The following numbers amount to ~48MB.
    const size_t lines_size = bufferSizes.lines;
    const size_t bin_data_size = bufferSizes.bin_data;
    const size_t tiles_size = bufferSizes.tiles;
    const size_t segments_size = bufferSizes.segments;
    const size_t seg_counts_size = bufferSizes.seg_counts;
    const size_t ptcl_size = bufferSizes.ptcl;

    // See the comments in VelloComputeSteps.h for an explanation of the logic here.

    builder.assignSharedBuffer(configBuf, kVelloSlot_ConfigUniform);
    builder.assignSharedBuffer(sceneBuf, kVelloSlot_Scene);

    // Buffers get cleared ahead of the entire DispatchGroup. Allocate the bump buffer early to
    // avoid a potentially recycled (and prematurely cleared) scratch buffer.
    ScratchBuffer bump = bufMgr->getScratchStorage(bufferSizes.bump_alloc);
    builder.assignSharedBuffer(new_scratch_slice(bump), kVelloSlot_BumpAlloc, ClearBuffer::kYes);

    // path_reduce
    ScratchBuffer tagmonoids = bufMgr->getScratchStorage(bufferSizes.path_monoids);
    {
        // This can be immediately returned after input processing.
        ScratchBuffer pathtagReduceOutput = bufMgr->getScratchStorage(bufferSizes.path_reduced);
        builder.assignSharedBuffer(new_scratch_slice(pathtagReduceOutput),
                                   kVelloSlot_PathtagReduceOutput);
        builder.assignSharedBuffer(new_scratch_slice(tagmonoids), kVelloSlot_TagMonoid);
        builder.appendStep(&fPathtagReduce, to_wg_size(dispatchInfo.path_reduce));

        // If the input is too large to be fully processed by a single workgroup then a second
        // reduce step and two scan steps are necessary. Otherwise one reduce+scan pair is
        // sufficient.
        //
        // In either case, the result is `tagmonoids`.
        if (dispatchInfo.use_large_path_scan) {
            ScratchBuffer reduced2 = bufMgr->getScratchStorage(bufferSizes.path_reduced2);
            ScratchBuffer reducedScan = bufMgr->getScratchStorage(bufferSizes.path_reduced_scan);

            builder.assignSharedBuffer(new_scratch_slice(reduced2),
                                       kVelloSlot_LargePathtagReduceSecondPassOutput);
            builder.assignSharedBuffer(new_scratch_slice(reducedScan),
                                       kVelloSlot_LargePathtagScanFirstPassOutput);

            builder.appendStep(&fPathtagReduce2, to_wg_size(dispatchInfo.path_reduce2));
            builder.appendStep(&fPathtagScan1, to_wg_size(dispatchInfo.path_scan1));
            builder.appendStep(&fPathtagScanLarge, to_wg_size(dispatchInfo.path_scan));
        } else {
            builder.appendStep(&fPathtagScanSmall, to_wg_size(dispatchInfo.path_scan));
        }
    }

    // bbox_clear
    ScratchBuffer pathBboxes = bufMgr->getScratchStorage(bufferSizes.path_bboxes);
    builder.assignSharedBuffer(new_scratch_slice(pathBboxes), kVelloSlot_PathBBoxes);
    builder.appendStep(&fBboxClear, to_wg_size(dispatchInfo.bbox_clear));

    // flatten
    ScratchBuffer lines = bufMgr->getScratchStorage(lines_size);
    builder.assignSharedBuffer(new_scratch_slice(lines), kVelloSlot_Lines);
    builder.appendStep(&fFlatten, to_wg_size(dispatchInfo.flatten));

    tagmonoids.returnToPool();

    // draw_reduce
    ScratchBuffer drawReduced = bufMgr->getScratchStorage(bufferSizes.draw_reduced);
    builder.assignSharedBuffer(new_scratch_slice(drawReduced), kVelloSlot_DrawReduceOutput);
    builder.appendStep(&fDrawReduce, to_wg_size(dispatchInfo.draw_reduce));

    // draw_leaf
    ScratchBuffer drawMonoids = bufMgr->getScratchStorage(bufferSizes.draw_monoids);
    ScratchBuffer binData = bufMgr->getScratchStorage(bin_data_size);
    // A clip input buffer must still get bound even if the encoding doesn't contain any clips
    ScratchBuffer clipInput = bufMgr->getScratchStorage(bufferSizes.clip_inps);
    builder.assignSharedBuffer(new_scratch_slice(drawMonoids), kVelloSlot_DrawMonoid);
    builder.assignSharedBuffer(new_scratch_slice(binData), kVelloSlot_InfoBinData);
    builder.assignSharedBuffer(new_scratch_slice(clipInput), kVelloSlot_ClipInput);
    builder.appendStep(&fDrawLeaf, to_wg_size(dispatchInfo.draw_leaf));

    drawReduced.returnToPool();

    // clip_reduce, clip_leaf
    // The clip bbox buffer is always an input to the binning stage, even when the encoding
    // doesn't contain any clips
    ScratchBuffer clipBboxes = bufMgr->getScratchStorage(bufferSizes.clip_bboxes);
    builder.assignSharedBuffer(new_scratch_slice(clipBboxes), kVelloSlot_ClipBBoxes);
    WorkgroupSize clipReduceWgCount = to_wg_size(dispatchInfo.clip_reduce);
    WorkgroupSize clipLeafWgCount = to_wg_size(dispatchInfo.clip_leaf);
    bool doClipReduce = clipReduceWgCount.scalarSize() > 0u;
    bool doClipLeaf = clipLeafWgCount.scalarSize() > 0u;
    if (doClipReduce || doClipLeaf) {
        ScratchBuffer clipBic = bufMgr->getScratchStorage(bufferSizes.clip_bics);
        ScratchBuffer clipEls = bufMgr->getScratchStorage(bufferSizes.clip_els);
        builder.assignSharedBuffer(new_scratch_slice(clipBic), kVelloSlot_ClipBicyclic);
        builder.assignSharedBuffer(new_scratch_slice(clipEls), kVelloSlot_ClipElement);
        if (doClipReduce) {
            builder.appendStep(&fClipReduce, clipReduceWgCount);
        }
        if (doClipLeaf) {
            builder.appendStep(&fClipLeaf, clipLeafWgCount);
        }
    }

    clipInput.returnToPool();

    // binning
    ScratchBuffer drawBboxes = bufMgr->getScratchStorage(bufferSizes.draw_bboxes);
    ScratchBuffer binHeaders = bufMgr->getScratchStorage(bufferSizes.bin_headers);
    builder.assignSharedBuffer(new_scratch_slice(drawBboxes), kVelloSlot_DrawBBoxes);
    builder.assignSharedBuffer(new_scratch_slice(binHeaders), kVelloSlot_BinHeader);
    builder.appendStep(&fBinning, to_wg_size(dispatchInfo.binning));

    pathBboxes.returnToPool();
    clipBboxes.returnToPool();

    // tile_alloc
    ScratchBuffer paths = bufMgr->getScratchStorage(bufferSizes.paths);
    ScratchBuffer tiles = bufMgr->getScratchStorage(tiles_size);
    builder.assignSharedBuffer(new_scratch_slice(paths), kVelloSlot_Path);
    builder.assignSharedBuffer(new_scratch_slice(tiles), kVelloSlot_Tile);
    builder.appendStep(&fTileAlloc, to_wg_size(dispatchInfo.tile_alloc));

    drawBboxes.returnToPool();

    // path_count_setup
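    // The indirect count buffer is shared across stages: it is bound to kVelloSlot_IndirectCount
    // and later consumed as the indirect dispatch source for the path_count and path_tiling
    // steps below.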
    auto indirectCountBuffer = new_indirect_slice(bufMgr, bufferSizes.indirect_count);
    builder.assignSharedBuffer(indirectCountBuffer, kVelloSlot_IndirectCount);
    builder.appendStep(&fPathCountSetup, to_wg_size(dispatchInfo.path_count_setup));

    // Rasterization stage scratch buffers.
    ScratchBuffer seg_counts = bufMgr->getScratchStorage(seg_counts_size);
    ScratchBuffer segments = bufMgr->getScratchStorage(segments_size);
    ScratchBuffer ptcl = bufMgr->getScratchStorage(ptcl_size);

    // path_count
    builder.assignSharedBuffer(new_scratch_slice(seg_counts), kVelloSlot_SegmentCounts);
    builder.appendStepIndirect(&fPathCount, indirectCountBuffer);

    // backdrop
    builder.appendStep(&fBackdrop, to_wg_size(dispatchInfo.backdrop));

    // coarse
    builder.assignSharedBuffer(new_scratch_slice(ptcl), kVelloSlot_PTCL);
    builder.appendStep(&fCoarse, to_wg_size(dispatchInfo.coarse));

    // path_tiling_setup
    builder.appendStep(&fPathTilingSetup, to_wg_size(dispatchInfo.path_tiling_setup));

    // path_tiling
    builder.assignSharedBuffer(new_scratch_slice(segments), kVelloSlot_Segments);
    builder.appendStepIndirect(&fPathTiling, indirectCountBuffer);

    // fine
    builder.assignSharedTexture(std::move(target), kVelloSlot_OutputImage);
    const ComputeStep* fineVariant = nullptr;
    switch (params.fAaConfig) {
        case VelloAaConfig::kAnalyticArea:
            fineVariant = fFineArea.get();
            break;
        case VelloAaConfig::kMSAA16:
            fineVariant = fFineMsaa16.get();
            break;
        case VelloAaConfig::kMSAA8:
            fineVariant = fFineMsaa8.get();
            break;
    }
    SkASSERT(fineVariant != nullptr);
    builder.appendStep(fineVariant, to_wg_size(dispatchInfo.fine));

    return builder.finalize();
}

}  // namespace skgpu::graphite