/*
 * Copyright 2023 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "src/gpu/graphite/compute/VelloRenderer.h"

#include "include/core/SkPath.h"
#include "include/core/SkTypes.h"
#include "include/gpu/graphite/Recorder.h"
#include "src/core/SkGeometry.h"
#include "src/core/SkPathPriv.h"
#include "src/core/SkTraceEvent.h"
#include "src/gpu/graphite/BufferManager.h"
#include "src/gpu/graphite/Caps.h"
#include "src/gpu/graphite/DrawParams.h"
#include "src/gpu/graphite/Log.h"
#include "src/gpu/graphite/PipelineData.h"
#include "src/gpu/graphite/RecorderPriv.h"
#include "src/gpu/graphite/TextureProxy.h"
#include "src/gpu/graphite/TextureUtils.h"
#include "src/gpu/graphite/UniformManager.h"
#include "src/gpu/graphite/compute/DispatchGroup.h"

#include <algorithm>

namespace skgpu::graphite {

namespace {

BindBufferInfo new_scratch_slice(ScratchBuffer& scratch) {
    size_t size = scratch.size();  // Use the whole buffer.
    return scratch.suballocate(size);
}

BindBufferInfo new_indirect_slice(DrawBufferManager* mgr, size_t size) {
    return mgr->getIndirectStorage(size, ClearBuffer::kYes);
}

::rust::Slice<uint8_t> to_slice(void* ptr, size_t size) {
    return {static_cast<uint8_t*>(ptr), size};
}

vello_cpp::Affine to_vello_affine(const SkMatrix& m) {
    // Vello currently doesn't support perspective scaling and the encoding only accepts a 2x3
    // affine transform matrix.
    return {{m.get(0), m.get(3), m.get(1), m.get(4), m.get(2), m.get(5)}};
}

vello_cpp::Point to_vello_point(const SkPoint& p) { return {p.x(), p.y()}; }

vello_cpp::Color to_vello_color(const SkColor4f& color) {
    SkColor c = color.toSkColor();
    return {
            static_cast<uint8_t>(SkColorGetR(c)),
            static_cast<uint8_t>(SkColorGetG(c)),
            static_cast<uint8_t>(SkColorGetB(c)),
            static_cast<uint8_t>(SkColorGetA(c)),
    };
}

WorkgroupSize to_wg_size(const vello_cpp::WorkgroupSize& src) {
    return WorkgroupSize(src.x, src.y, src.z);
}
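// A note on to_vello_affine() above (illustrative; the authoritative layouts are
// SkMatrix and vello_cpp::Affine themselves): SkMatrix stores a row-major 3x3
// matrix,
//
//   | scaleX  skewX   transX |           | 0  1  2 |
//   | skewY   scaleY  transY |   indices | 3  4  5 |
//   | persp0  persp1  persp2 |           | 6  7  8 |
//
// while Vello's affine is the 2x3 coefficient list [a, b, c, d, e, f] mapping
// x' = a*x + c*y + e and y' = b*x + d*y + f. Reading the SkMatrix entries in
// column-major order (0, 3, 1, 4, 2, 5) produces exactly that list; the
// perspective row (indices 6-8) is dropped, which is why perspective transforms
// are unsupported.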
// Vello does not provide an encoding for inverse fill types. When Skia uses vello to render
// a coverage mask for an inverse fill, it encodes a regular fill and inverts the coverage value
// after sampling the mask.
vello_cpp::Fill to_fill_type(SkPathFillType fillType) {
    switch (fillType) {
        case SkPathFillType::kWinding:
        case SkPathFillType::kInverseWinding:
            return vello_cpp::Fill::NonZero;
        case SkPathFillType::kEvenOdd:
        case SkPathFillType::kInverseEvenOdd:
            return vello_cpp::Fill::EvenOdd;
    }
    return vello_cpp::Fill::NonZero;
}

vello_cpp::CapStyle to_cap_style(SkPaint::Cap cap) {
    switch (cap) {
        case SkPaint::Cap::kButt_Cap:
            return vello_cpp::CapStyle::Butt;
        case SkPaint::Cap::kRound_Cap:
            return vello_cpp::CapStyle::Round;
        case SkPaint::Cap::kSquare_Cap:
            return vello_cpp::CapStyle::Square;
    }
    SkUNREACHABLE;
}

vello_cpp::JoinStyle to_join_style(SkPaint::Join join) {
    switch (join) {
        case SkPaint::Join::kMiter_Join:
            return vello_cpp::JoinStyle::Miter;
        case SkPaint::Join::kBevel_Join:
            return vello_cpp::JoinStyle::Bevel;
        case SkPaint::Join::kRound_Join:
            return vello_cpp::JoinStyle::Round;
    }
    SkUNREACHABLE;
}

vello_cpp::Stroke to_stroke(const SkStrokeRec& style) {
    return vello_cpp::Stroke{
            /*width=*/style.getWidth(),
            /*miter_limit=*/style.getMiter(),
            /*cap=*/to_cap_style(style.getCap()),
            /*join=*/to_join_style(style.getJoin()),
    };
}

class PathIter : public vello_cpp::PathIterator {
public:
    PathIter(const SkPath& path, const Transform& t)
            : fIterate(path), fIter(fIterate.begin()), fTransform(t) {}

    bool next_element(vello_cpp::PathElement* outElem) override {
        if (fConicQuadIdx < fConicConverter.countQuads()) {
            SkASSERT(fConicQuads != nullptr);
            outElem->verb = vello_cpp::PathVerb::QuadTo;
            int pointIdx = fConicQuadIdx * 2;
            outElem->points[0] = to_vello_point(fConicQuads[pointIdx]);
            outElem->points[1] = to_vello_point(fConicQuads[pointIdx + 1]);
            outElem->points[2] = to_vello_point(fConicQuads[pointIdx + 2]);
            fConicQuadIdx++;
            return true;
        }

        if (fIter == fIterate.end()) {
            return false;
        }

        SkASSERT(outElem);
        auto [verb, points, weights] = *fIter;
        fIter++;

        switch (verb) {
            case SkPathVerb::kMove:
                outElem->verb = vello_cpp::PathVerb::MoveTo;
                outElem->points[0] = to_vello_point(points[0]);
                break;
            case SkPathVerb::kLine:
                outElem->verb = vello_cpp::PathVerb::LineTo;
                outElem->points[0] = to_vello_point(points[0]);
                outElem->points[1] = to_vello_point(points[1]);
                break;
            case SkPathVerb::kConic:
                // The vello encoding API doesn't handle conic sections. Approximate it with
                // quadratic Béziers.
                SkASSERT(fConicQuadIdx >= fConicConverter.countQuads());  // No other conic->quad
                                                                          // conversions should be
                                                                          // in progress.
                fConicQuads = fConicConverter.computeQuads(
                        points, *weights, 0.25 / fTransform.maxScaleFactor());
                outElem->verb = vello_cpp::PathVerb::QuadTo;
                outElem->points[0] = to_vello_point(fConicQuads[0]);
                outElem->points[1] = to_vello_point(fConicQuads[1]);
                outElem->points[2] = to_vello_point(fConicQuads[2]);

                // The next call to `next_element` will yield the next quad in the list (at
                // index 1) if `fConicConverter` contains more than 1 quad.
                fConicQuadIdx = 1;
                break;
            case SkPathVerb::kQuad:
                outElem->verb = vello_cpp::PathVerb::QuadTo;
                outElem->points[0] = to_vello_point(points[0]);
                outElem->points[1] = to_vello_point(points[1]);
                outElem->points[2] = to_vello_point(points[2]);
                break;
            case SkPathVerb::kCubic:
                outElem->verb = vello_cpp::PathVerb::CurveTo;
                outElem->points[0] = to_vello_point(points[0]);
                outElem->points[1] = to_vello_point(points[1]);
                outElem->points[2] = to_vello_point(points[2]);
                outElem->points[3] = to_vello_point(points[3]);
                break;
            case SkPathVerb::kClose:
                outElem->verb = vello_cpp::PathVerb::Close;
                break;
        }

        return true;
    }

private:
    SkPathPriv::Iterate fIterate;
    SkPathPriv::RangeIter fIter;

    // Variables used to track conic to quadratic spline conversion. `fTransform` is used to
    // determine the subpixel error tolerance in device coordinate space.
    const Transform& fTransform;
    SkAutoConicToQuads fConicConverter;
    const SkPoint* fConicQuads = nullptr;
    int fConicQuadIdx = 0;
};
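// A note on the conic handling in PathIter above (illustrative; the authoritative
// contract is SkAutoConicToQuads in src/core/SkGeometry.h): computeQuads() returns
// a pointer to 2*N + 1 points for N quads, where consecutive quads share an
// endpoint. Quad i is the point triple {P[2i], P[2i+1], P[2i+2]}, which is why
// next_element() indexes with `pointIdx = fConicQuadIdx * 2`:
//
//   N = 2 quads:  P0 P1 P2 P3 P4
//                 quad 0 = {P0, P1, P2}
//                 quad 1 = {P2, P3, P4}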
}  // namespace

VelloScene::VelloScene() : fEncoding(vello_cpp::new_encoding()) {}

void VelloScene::reset() {
    fEncoding->reset();
}

void VelloScene::solidFill(const SkPath& shape,
                           const SkColor4f& fillColor,
                           const SkPathFillType fillType,
                           const Transform& t) {
    PathIter iter(shape, t);
    fEncoding->fill(to_fill_type(fillType),
                    to_vello_affine(t),
                    {vello_cpp::BrushKind::Solid, {to_vello_color(fillColor)}},
                    iter);
}

void VelloScene::solidStroke(const SkPath& shape,
                             const SkColor4f& fillColor,
                             const SkStrokeRec& style,
                             const Transform& t) {
    // TODO: Obtain dashing pattern here and let Vello handle dashing on the CPU while
    // encoding the path?
    PathIter iter(shape, t);
    vello_cpp::Brush brush{vello_cpp::BrushKind::Solid, {to_vello_color(fillColor)}};
    fEncoding->stroke(to_stroke(style), to_vello_affine(t), brush, iter);
}

void VelloScene::pushClipLayer(const SkPath& shape, const Transform& t) {
    PathIter iter(shape, t);
    fEncoding->begin_clip(to_vello_affine(t), iter);
    SkDEBUGCODE(fLayers++;)
}

void VelloScene::popClipLayer() {
    SkASSERT(fLayers > 0);
    fEncoding->end_clip();
    SkDEBUGCODE(fLayers--;)
}

void VelloScene::append(const VelloScene& other) {
    fEncoding->append(*other.fEncoding);
}

VelloRenderer::VelloRenderer(const Caps* caps) {
    if (ComputeShaderCoverageMaskTargetFormat(caps) == kAlpha_8_SkColorType) {
        fFineArea = std::make_unique<VelloFineAreaAlpha8Step>();
        fFineMsaa16 = std::make_unique<VelloFineMsaa16Alpha8Step>();
        fFineMsaa8 = std::make_unique<VelloFineMsaa8Alpha8Step>();
    } else {
        fFineArea = std::make_unique<VelloFineAreaStep>();
        fFineMsaa16 = std::make_unique<VelloFineMsaa16Step>();
        fFineMsaa8 = std::make_unique<VelloFineMsaa8Step>();
    }
}

VelloRenderer::~VelloRenderer() = default;
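// Typical usage, as a sketch. The caller-side variables below (`renderer`,
// `devPath`, `localToDevice`, `maskTexture`, `recorder`) are assumptions for
// illustration, and how the returned DispatchGroup is scheduled onto a Recording
// is up to the caller; only the calls themselves are APIs defined in this file:
//
//   VelloScene scene;
//   scene.solidFill(devPath, SkColors::kWhite, SkPathFillType::kWinding, localToDevice);
//
//   RenderParams params;  // fWidth/fHeight, fBaseColor, and fAaConfig set by the caller
//   std::unique_ptr<DispatchGroup> group =
//           renderer->renderScene(params, scene, std::move(maskTexture), recorder);
//   // A null result means there was nothing to render or a buffer write failed.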
std::unique_ptr<DispatchGroup> VelloRenderer::renderScene(const RenderParams& params,
                                                          const VelloScene& scene,
                                                          sk_sp<TextureProxy> target,
                                                          Recorder* recorder) const {
    TRACE_EVENT0("skia.gpu", TRACE_FUNC);
    SkASSERT(target);

    if (scene.fEncoding->is_empty()) {
        return nullptr;
    }

    if (params.fWidth == 0 || params.fHeight == 0) {
        return nullptr;
    }

    // TODO: validate that the pixel format matches the pipeline layout.

    // Clamp the draw region to the target texture dimensions.
    const SkISize dims = target->dimensions();
    if (dims.isEmpty() || dims.fWidth < 0 || dims.fHeight < 0) {
        SKGPU_LOG_W("VelloRenderer: cannot render to an empty target");
        return nullptr;
    }

    SkASSERT(scene.fLayers == 0);  // Begin/end clips must be matched.
    auto config = scene.fEncoding->prepare_render(
            std::min(params.fWidth, static_cast<uint32_t>(dims.fWidth)),
            std::min(params.fHeight, static_cast<uint32_t>(dims.fHeight)),
            to_vello_color(params.fBaseColor));
    auto dispatchInfo = config->workgroup_counts();
    auto bufferSizes = config->buffer_sizes();

    DispatchGroup::Builder builder(recorder);

    // In total there are 25 resources that are used across the full pipeline stages. The sizes
    // of these resources depend on the encoded scene. We allocate all of them and assign them
    // directly to the builder here instead of delegating the logic to the ComputeSteps.
    DrawBufferManager* bufMgr = recorder->priv().drawBufferManager();

    size_t uboSize = config->config_uniform_buffer_size();
    auto [uboPtr, configBuf] = bufMgr->getUniformPointer(uboSize);
    if (!uboPtr || !config->write_config_uniform_buffer(to_slice(uboPtr, uboSize))) {
        return nullptr;
    }

    size_t sceneSize = config->scene_buffer_size();
    auto [scenePtr, sceneBuf] = bufMgr->getStoragePointer(sceneSize);
    if (!scenePtr || !config->write_scene_buffer(to_slice(scenePtr, sceneSize))) {
        return nullptr;
    }

    // TODO(b/285189802): The default sizes for the bump buffers (~97MB) exceed Graphite's
    // resource budget if multiple passes are necessary per frame (250MB, see ResourceCache.h).
    // We apply a crude size reduction here which seems to be enough for a 4k x 4k atlas render
    // for the GMs that we have tested. The numbers below are able to render GM_longpathdash
    // with CPU-side stroke expansion.
    //
    // We need to come up with a better approach to accurately predict the sizes for these
    // buffers based on the scene encoding and our resource budget. It should be possible to
    // build a conservative estimate using the total number of path verbs, some heuristic based
    // on the verb and the path's transform, and the total number of tiles.
    //
    // The following numbers amount to ~48MB.
    const size_t lines_size = bufferSizes.lines;
    const size_t bin_data_size = bufferSizes.bin_data;
    const size_t tiles_size = bufferSizes.tiles;
    const size_t segments_size = bufferSizes.segments;
    const size_t seg_counts_size = bufferSizes.seg_counts;
    const size_t ptcl_size = bufferSizes.ptcl;

    // See the comments in VelloComputeSteps.h for an explanation of the logic here.
    builder.assignSharedBuffer(configBuf, kVelloSlot_ConfigUniform);
    builder.assignSharedBuffer(sceneBuf, kVelloSlot_Scene);

    // Buffers get cleared ahead of the entire DispatchGroup. Allocate the bump buffer early to
    // avoid a potentially recycled (and prematurely cleared) scratch buffer.
    ScratchBuffer bump = bufMgr->getScratchStorage(bufferSizes.bump_alloc);
    builder.assignSharedBuffer(new_scratch_slice(bump), kVelloSlot_BumpAlloc, ClearBuffer::kYes);

    // path_reduce
    ScratchBuffer tagmonoids = bufMgr->getScratchStorage(bufferSizes.path_monoids);
    {
        // This can be immediately returned after input processing.
        ScratchBuffer pathtagReduceOutput = bufMgr->getScratchStorage(bufferSizes.path_reduced);
        builder.assignSharedBuffer(new_scratch_slice(pathtagReduceOutput),
                                   kVelloSlot_PathtagReduceOutput);
        builder.assignSharedBuffer(new_scratch_slice(tagmonoids), kVelloSlot_TagMonoid);
        builder.appendStep(&fPathtagReduce, to_wg_size(dispatchInfo.path_reduce));

        // If the input is too large to be fully processed by a single workgroup then a second
        // reduce step and two scan steps are necessary. Otherwise one reduce+scan pair is
        // sufficient.
        //
        // In either case, the result is `tagmonoids`.
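        // For intuition (the actual decision is made by vello_cpp when it
        // computes `dispatchInfo`, not here): if a single workgroup can scan W
        // tag monoids, then one reduce pass plus one scan pass covers up to
        // W * W tags, because the per-workgroup partial sums produced by
        // path_reduce must themselves fit in one scanning workgroup. Larger
        // encodings need the extra hierarchy level below: path_reduce2 reduces
        // the partials, path_scan1 scans them, and the large scan variant
        // consumes both.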
        if (dispatchInfo.use_large_path_scan) {
            ScratchBuffer reduced2 = bufMgr->getScratchStorage(bufferSizes.path_reduced2);
            ScratchBuffer reducedScan = bufMgr->getScratchStorage(bufferSizes.path_reduced_scan);

            builder.assignSharedBuffer(new_scratch_slice(reduced2),
                                       kVelloSlot_LargePathtagReduceSecondPassOutput);
            builder.assignSharedBuffer(new_scratch_slice(reducedScan),
                                       kVelloSlot_LargePathtagScanFirstPassOutput);

            builder.appendStep(&fPathtagReduce2, to_wg_size(dispatchInfo.path_reduce2));
            builder.appendStep(&fPathtagScan1, to_wg_size(dispatchInfo.path_scan1));
            builder.appendStep(&fPathtagScanLarge, to_wg_size(dispatchInfo.path_scan));
        } else {
            builder.appendStep(&fPathtagScanSmall, to_wg_size(dispatchInfo.path_scan));
        }
    }

    // bbox_clear
    ScratchBuffer pathBboxes = bufMgr->getScratchStorage(bufferSizes.path_bboxes);
    builder.assignSharedBuffer(new_scratch_slice(pathBboxes), kVelloSlot_PathBBoxes);
    builder.appendStep(&fBboxClear, to_wg_size(dispatchInfo.bbox_clear));

    // flatten
    ScratchBuffer lines = bufMgr->getScratchStorage(lines_size);
    builder.assignSharedBuffer(new_scratch_slice(lines), kVelloSlot_Lines);
    builder.appendStep(&fFlatten, to_wg_size(dispatchInfo.flatten));

    tagmonoids.returnToPool();

    // draw_reduce
    ScratchBuffer drawReduced = bufMgr->getScratchStorage(bufferSizes.draw_reduced);
    builder.assignSharedBuffer(new_scratch_slice(drawReduced), kVelloSlot_DrawReduceOutput);
    builder.appendStep(&fDrawReduce, to_wg_size(dispatchInfo.draw_reduce));

    // draw_leaf
    ScratchBuffer drawMonoids = bufMgr->getScratchStorage(bufferSizes.draw_monoids);
    ScratchBuffer binData = bufMgr->getScratchStorage(bin_data_size);
    // A clip input buffer must still get bound even if the encoding doesn't contain any clips.
    ScratchBuffer clipInput = bufMgr->getScratchStorage(bufferSizes.clip_inps);
    builder.assignSharedBuffer(new_scratch_slice(drawMonoids), kVelloSlot_DrawMonoid);
    builder.assignSharedBuffer(new_scratch_slice(binData), kVelloSlot_InfoBinData);
    builder.assignSharedBuffer(new_scratch_slice(clipInput), kVelloSlot_ClipInput);
    builder.appendStep(&fDrawLeaf, to_wg_size(dispatchInfo.draw_leaf));

    drawReduced.returnToPool();

    // clip_reduce, clip_leaf
    // The clip bbox buffer is always an input to the binning stage, even when the encoding
    // doesn't contain any clips.
    ScratchBuffer clipBboxes = bufMgr->getScratchStorage(bufferSizes.clip_bboxes);
    builder.assignSharedBuffer(new_scratch_slice(clipBboxes), kVelloSlot_ClipBBoxes);
    WorkgroupSize clipReduceWgCount = to_wg_size(dispatchInfo.clip_reduce);
    WorkgroupSize clipLeafWgCount = to_wg_size(dispatchInfo.clip_leaf);
    bool doClipReduce = clipReduceWgCount.scalarSize() > 0u;
    bool doClipLeaf = clipLeafWgCount.scalarSize() > 0u;
    if (doClipReduce || doClipLeaf) {
        ScratchBuffer clipBic = bufMgr->getScratchStorage(bufferSizes.clip_bics);
        ScratchBuffer clipEls = bufMgr->getScratchStorage(bufferSizes.clip_els);
        builder.assignSharedBuffer(new_scratch_slice(clipBic), kVelloSlot_ClipBicyclic);
        builder.assignSharedBuffer(new_scratch_slice(clipEls), kVelloSlot_ClipElement);
        if (doClipReduce) {
            builder.appendStep(&fClipReduce, clipReduceWgCount);
        }
        if (doClipLeaf) {
            builder.appendStep(&fClipLeaf, clipLeafWgCount);
        }
    }

    clipInput.returnToPool();

    // binning
    ScratchBuffer drawBboxes = bufMgr->getScratchStorage(bufferSizes.draw_bboxes);
    ScratchBuffer binHeaders = bufMgr->getScratchStorage(bufferSizes.bin_headers);
    builder.assignSharedBuffer(new_scratch_slice(drawBboxes), kVelloSlot_DrawBBoxes);
    builder.assignSharedBuffer(new_scratch_slice(binHeaders), kVelloSlot_BinHeader);
    builder.appendStep(&fBinning, to_wg_size(dispatchInfo.binning));

    pathBboxes.returnToPool();
    clipBboxes.returnToPool();
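    // A note on the returnToPool() calls throughout this function (the recycling
    // semantics live in ScratchBuffer/DrawBufferManager, see BufferManager.h; this
    // summarizes the intent here): returning a scratch buffer makes it eligible
    // for reuse by later getScratchStorage() calls, while the slices already
    // assigned to the builder remain bound for the recorded dispatches. Returning
    // each buffer right after the last stage that reads it has been appended
    // (e.g. pathBboxes and clipBboxes after binning above) keeps the peak scratch
    // footprint lower than holding everything until finalize().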
    // tile_alloc
    ScratchBuffer paths = bufMgr->getScratchStorage(bufferSizes.paths);
    ScratchBuffer tiles = bufMgr->getScratchStorage(tiles_size);
    builder.assignSharedBuffer(new_scratch_slice(paths), kVelloSlot_Path);
    builder.assignSharedBuffer(new_scratch_slice(tiles), kVelloSlot_Tile);
    builder.appendStep(&fTileAlloc, to_wg_size(dispatchInfo.tile_alloc));

    drawBboxes.returnToPool();

    // path_count_setup
    auto indirectCountBuffer = new_indirect_slice(bufMgr, bufferSizes.indirect_count);
    builder.assignSharedBuffer(indirectCountBuffer, kVelloSlot_IndirectCount);
    builder.appendStep(&fPathCountSetup, to_wg_size(dispatchInfo.path_count_setup));

    // Rasterization stage scratch buffers.
    ScratchBuffer seg_counts = bufMgr->getScratchStorage(seg_counts_size);
    ScratchBuffer segments = bufMgr->getScratchStorage(segments_size);
    ScratchBuffer ptcl = bufMgr->getScratchStorage(ptcl_size);

    // path_count
    builder.assignSharedBuffer(new_scratch_slice(seg_counts), kVelloSlot_SegmentCounts);
    builder.appendStepIndirect(&fPathCount, indirectCountBuffer);

    // backdrop
    builder.appendStep(&fBackdrop, to_wg_size(dispatchInfo.backdrop));

    // coarse
    builder.assignSharedBuffer(new_scratch_slice(ptcl), kVelloSlot_PTCL);
    builder.appendStep(&fCoarse, to_wg_size(dispatchInfo.coarse));

    // path_tiling_setup
    builder.appendStep(&fPathTilingSetup, to_wg_size(dispatchInfo.path_tiling_setup));

    // path_tiling
    builder.assignSharedBuffer(new_scratch_slice(segments), kVelloSlot_Segments);
    builder.appendStepIndirect(&fPathTiling, indirectCountBuffer);

    // fine
    builder.assignSharedTexture(std::move(target), kVelloSlot_OutputImage);
    const ComputeStep* fineVariant = nullptr;
    switch (params.fAaConfig) {
        case VelloAaConfig::kAnalyticArea:
            fineVariant = fFineArea.get();
            break;
        case VelloAaConfig::kMSAA16:
            fineVariant = fFineMsaa16.get();
            break;
        case VelloAaConfig::kMSAA8:
            fineVariant = fFineMsaa8.get();
            break;
    }
    SkASSERT(fineVariant != nullptr);
    builder.appendStep(fineVariant, to_wg_size(dispatchInfo.fine));

    return builder.finalize();
}

}  // namespace skgpu::graphite