// Copyright 2021 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ABSL_STRINGS_INTERNAL_CORD_REP_BTREE_H_
#define ABSL_STRINGS_INTERNAL_CORD_REP_BTREE_H_

#include <cassert>
#include <cstdint>
#include <iosfwd>

#include "absl/base/config.h"
#include "absl/base/internal/raw_logging.h"
#include "absl/base/optimization.h"
#include "absl/strings/internal/cord_data_edge.h"
#include "absl/strings/internal/cord_internal.h"
#include "absl/strings/internal/cord_rep_flat.h"
#include "absl/strings/string_view.h"
#include "absl/types/span.h"

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace cord_internal {

class CordRepBtreeNavigator;

// CordRepBtree is, as the name implies, a btree implementation of a CordRep
// tree. Data is stored at the leaf level only; non-leaf nodes contain down
// pointers only. Allowed types of data edges are FLAT, EXTERNAL and SUBSTRINGs
// of FLAT or EXTERNAL nodes. The implementation only allows data to be added
// to either end of the tree; it does not provide any 'insert' logic. This has
// the benefit that we can expect good fill ratios: all nodes except the outer
// 'legs' will have 100% fill ratios for trees built using the Append/Prepend
// methods. Merged trees will typically have a fill ratio well above 50%: in a
// similar fashion, one side of the merged tree will typically have a 100% fill
// ratio, while the 'open' end will average 50%. All operations are O(log(n))
// or better, and the tree never needs balancing.
//
// All methods accepting a CordRep* or CordRepBtree* adopt a reference on that
// input unless explicitly stated otherwise. All functions returning a CordRep*
// or CordRepBtree* instance transfer a reference back to the caller.
// Simplified, callers both 'donate' and 'consume' a reference count on each
// call, simplifying the API. An example of building a tree:
//
//   CordRepBtree* tree = CordRepBtree::Create(MakeFlat("Hello"));
//   tree = CordRepBtree::Append(tree, MakeFlat("world"));
//
// In the above example, all inputs are consumed, making each call affecting
// `tree` reference count neutral. The returned `tree` value can be different
// from the input if the input is shared with other threads, or if the tree
// grows in height, but callers typically never have to concern themselves with
// that and trust that all methods DTRT at all times.
class CordRepBtree : public CordRep {
 public:
  // EdgeType identifies `front` and `back` enum values.
  // Various implementations in CordRepBtree such as `Add` and `Edge` are
  // generic and templated on operating on either of the boundary edges.
  // For more information on the possible edges contained in a CordRepBtree
  // instance see the documentation for `edges_`.
  enum class EdgeType { kFront, kBack };

  // Convenience constants into `EdgeType`
  static constexpr EdgeType kFront = EdgeType::kFront;
  static constexpr EdgeType kBack = EdgeType::kBack;

  // Maximum number of edges: based on experiments and performance data, we can
  // pick suitable values resulting in optimum cacheline aligned values. The
  // preferred values are based on 64-bit systems where we aim to align this
  // class onto 64 bytes, i.e.: 6 = 64 bytes, 14 = 128 bytes, etc.
  // TODO(b/192061034): experiment with alternative sizes.
  static constexpr size_t kMaxCapacity = 6;

  // Reasonable maximum height of the btree. We can expect a fill ratio of at
  // least 50%: trees are always expanded at the front or back. Concatenating
  // trees will then typically fold at the top most node, where the lower nodes
  // are at least at capacity on one side of joined inputs. At a lower fill
  // rate of 4 edges per node, we have capacity for ~16 million leaf nodes.
  // We will fail / abort if an application ever exceeds this height, which
  // should be extremely rare (near impossible) and be an indication of an
  // application error: we do not assume it reasonable for any application to
  // operate correctly with such monster trees.
  // Another compelling reason for the number `12` is that any contextual stack
  // required for navigation or insertion requires 12 words and 12 bytes, which
  // fits inside 2 cache lines with some room to spare, and is reasonable as a
  // local stack variable compared to Cord's current near 400 bytes stack use.
  // The maximum `height` value of a node is then `kMaxDepth - 1` as node
  // height values start with a value of 0 for leaf nodes.
  static constexpr size_t kMaxDepth = 12;
  // See comments on height() for why this is an int and not a size_t.
  static constexpr int kMaxHeight = static_cast<int>(kMaxDepth - 1);

  // `Action` defines the action for unwinding changes done at the btree's leaf
  // level that need to be propagated up to the parent node(s). Each operation
  // on a node has an effect / action defined as follows:
  // - kSelf
  //   The operation (add / update, etc) was performed directly on the node as
  //   the node is private to the current thread (i.e.: not shared directly or
  //   indirectly through a refcount > 1). Changes can be propagated directly
  //   to all parent nodes as all parent nodes are also then private to the
  //   current thread.
  // - kCopied
  //   The operation (add / update, etc) was performed on a copy of the
  //   original node, as the node is (potentially) directly or indirectly
  //   shared with other threads. Changes need to be propagated into the parent
  //   nodes where the old down pointer must be unreffed and replaced with this
  //   new copy. Such changes to parent nodes may themselves require a copy if
  //   the parent node is also shared. A kCopied action can propagate all the
  //   way to the top node where we then must unref the `tree` input provided
  //   by the caller, and return the new copy.
  // - kPopped
  //   The operation (typically add) could not be satisfied due to insufficient
  //   capacity in the targeted node, and a new 'leg' was created that needs to
  //   be added into the parent node. For example, adding a FLAT inside a leaf
  //   node that is at capacity will create a new leaf node containing that
  //   FLAT, that needs to be 'popped' up the btree. Such 'pop' actions can
  //   cascade up the tree if parent nodes are also at capacity. A 'Popped'
  //   action propagating all the way to the top of the tree will result in
  //   the tree becoming one level higher than the current tree through a final
  //   `CordRepBtree::New(tree, popped)` call, resulting in a new top node
  //   referencing the old tree and the new (fully popped upwards) 'leg'.
  enum Action { kSelf, kCopied, kPopped };

  // Result of an operation on a node. See the `Action` enum for details.
  struct OpResult {
    CordRepBtree* tree;
    Action action;
  };

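
  // A hedged sketch (illustrative only, not the actual update logic) of how a
  // result for an operation on a node's back edge could be folded into its
  // parent, following the `Action` semantics documented above. `parent` and
  // `result` are hypothetical names:
  //
  //   OpResult result = ...;   // result of an operation on the back edge
  //   switch (result.action) {
  //     case kSelf:
  //       break;  // edge was modified in place; only lengths need updating
  //     case kCopied:
  //       parent->SetEdge<kBack>(result.tree);  // unrefs and replaces old edge
  //       break;
  //     case kPopped:
  //       // `result.tree` is a new 'leg' that must be added to `parent`; at
  //       // the top of the tree this becomes `CordRepBtree::New(tree, popped)`.
  //       break;
  //   }
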
  // Return value of the CopyPrefix and CopySuffix methods which can
  // return a node or data edge at any height inside the tree.
  // A height of 0 defines the lowest (leaf) node, a height of -1 identifies
  // `edge` as being a plain data node: EXTERNAL / FLAT or SUBSTRING thereof.
  struct CopyResult {
    CordRep* edge;
    int height;
  };

  // Logical position inside a node:
  // - index: index of the edge.
  // - n: size or offset value depending on context.
  struct Position {
    size_t index;
    size_t n;
  };
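
  // For example, for a leaf node with begin() == 0 holding two data edges of
  // lengths 10 and 20, the (private) `IndexOf(15)` lookup defined below yields
  // Position{1, 5}: offset 15 falls 5 bytes into the second edge.
  // (Illustrative values only.)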

  // Creates a btree from the given input. Adopts a ref of `rep`.
  // If the input `rep` is itself a btree, i.e., `IsBtree()`, then this
  // function immediately returns `rep->btree()`. If the input is a valid data
  // edge (see IsDataEdge()), then a new leaf node is returned containing `rep`
  // as the sole data edge. Else, the input is assumed to be a (legacy) concat
  // tree, and the input is consumed and transformed into a btree().
  static CordRepBtree* Create(CordRep* rep);

  // Destroys the provided tree. Should only be called by cord internal API's,
  // typically after a ref_count.Decrement() on the last reference count.
  static void Destroy(CordRepBtree* tree);

  // Destruction
  static void Delete(CordRepBtree* tree) { delete tree; }

  // Use CordRep::Unref() as we overload for absl::Span<CordRep* const>.
  using CordRep::Unref;

  // Unrefs all edges in `edges`, which are assumed to have a refcount of
  // 'likely one'.
  static void Unref(absl::Span<CordRep* const> edges);

  // Appends / Prepends an existing CordRep instance to this tree.
  // The below methods accept three types of input:
  // 1) `rep` is a data node (see `IsDataEdge()` for valid data edges).
  //    `rep` is appended or prepended to this tree 'as is'.
  // 2) `rep` is a BTREE.
  //    `rep` is merged into `tree` respecting the Append/Prepend order.
  // 3) `rep` is some other (legacy) type.
  //    `rep` is converted in place and added to `tree`.
  // Requires `tree` and `rep` to be not null.
  static CordRepBtree* Append(CordRepBtree* tree, CordRep* rep);
  static CordRepBtree* Prepend(CordRepBtree* tree, CordRep* rep);

  // Append/Prepend the data in `data` to this tree.
  // The `extra` parameter defines how much extra capacity should be allocated
  // for any additional FLAT being allocated. This is an optimization hint from
  // the caller. For example, a caller may need to add 2 string_views of data
  // "abc" and "defghi" which are not consecutive. The caller can in this case
  // invoke `AddData(tree, "abc", 6)`, and any newly added flat is allocated
  // where possible with at least 6 bytes of extra capacity beyond `length`.
  // This helps avoid data getting fragmented over multiple flats.
  // There is no limit on the size of `data`. If `data` cannot be stored inside
  // a single flat, then the function will iteratively add flats until all data
  // has been consumed and appended or prepended to the tree.
  static CordRepBtree* Append(CordRepBtree* tree, string_view data,
                              size_t extra = 0);
  static CordRepBtree* Prepend(CordRepBtree* tree, string_view data,
                               size_t extra = 0);
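
  // A hedged usage sketch (illustrative only) of the `extra` hint for the
  // "abc" / "defghi" example above:
  //
  //   tree = CordRepBtree::Append(tree, "abc", /*extra=*/6);
  //   ...                                        // unrelated work in between
  //   tree = CordRepBtree::Append(tree, "defghi");
  //
  // With the hint, the flat allocated for "abc" typically has room for the
  // following 6 bytes, so "defghi" lands in the same flat instead of being
  // fragmented over a new one.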

  // Returns a new tree, containing `n` bytes of data from this instance
  // starting at offset `offset`. Where possible, the returned tree shares
  // (re-uses) data edges and nodes with this instance to minimize the
  // combined memory footprint of both trees.
  // Requires `offset + n <= length`. Returns `nullptr` if `n` is zero.
  CordRep* SubTree(size_t offset, size_t n);
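
  // A minimal usage sketch (illustrative only); the returned rep carries its
  // own reference, which the caller must eventually release:
  //
  //   CordRep* slice = tree->SubTree(/*offset=*/1, /*n=*/4);
  //   ...
  //   CordRep::Unref(slice);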

  // Removes `n` trailing bytes from `tree`, and returns the resulting tree
  // or data edge. Returns `tree` if n is zero, and nullptr if n == length.
  // This function is logically identical to:
  //   result = tree->SubTree(0, tree->length - n);
  //   Unref(tree);
  //   return result;
  // However, the actual implementation will as much as possible perform 'in
  // place' modifications on the tree on all nodes and edges that are mutable.
  // For example, in a fully privately owned tree with the last edge being a
  // flat of length 12, RemoveSuffix(1) will simply set the length of that data
  // edge to 11, and reduce the length of all nodes on the edge path by 1.
  static CordRep* RemoveSuffix(CordRepBtree* tree, size_t n);

  // Returns the character at the given offset.
  char GetCharacter(size_t offset) const;

  // Returns true if this node holds a single data edge, and if so, sets
  // `fragment` to reference the contained data. `fragment` is an optional
  // output parameter and allowed to be null.
  bool IsFlat(absl::string_view* fragment) const;

  // Returns true if the data of `n` bytes starting at offset `offset`
  // is contained in a single data edge, and if so, sets fragment to reference
  // the contained data. `fragment` is an optional output parameter and allowed
  // to be null.
  bool IsFlat(size_t offset, size_t n, absl::string_view* fragment) const;
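
  // A minimal usage sketch (illustrative only):
  //
  //   absl::string_view fragment;
  //   if (tree->IsFlat(/*offset=*/10, /*n=*/4, &fragment)) {
  //     // `fragment` references bytes [10, 14) of the tree without copying.
  //   }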

  // Returns a span (mutable range of bytes) of up to `size` bytes into the
  // last FLAT data edge inside this tree under the following conditions:
  // - none of the nodes down into the FLAT node are shared.
  // - the last data edge in this tree is a non-shared FLAT.
  // - the referenced FLAT has additional capacity available.
  // If all these conditions are met, a non-empty span is returned, and the
  // length of the flat node and involved tree nodes have been increased by
  // `span.length()`. The caller is responsible for immediately assigning
  // values to all uninitialized data referenced by the returned span.
  // Requires `this->refcount.IsOne()`: this function forces the caller to do
  // this fast path check on the top level node, as this is the most commonly
  // shared node of a cord tree.
  Span<char> GetAppendBuffer(size_t size);
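
  // A hedged usage sketch (illustrative only), assuming `src` holds the data
  // still to be appended and `tree->refcount.IsOne()` holds:
  //
  //   absl::Span<char> buffer = tree->GetAppendBuffer(src.size());
  //   if (!buffer.empty()) {
  //     memcpy(buffer.data(), src.data(), buffer.size());  // fill immediately
  //     src.remove_prefix(buffer.size());
  //   }
  //   // Any remainder in `src` must be added through Append() instead.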

  // Extracts the right-most data edge from this tree iff:
  // - the tree and all internal edges to the right-most node are not shared.
  // - the right-most node is a FLAT node and not shared.
  // - the right-most node has at least the desired extra capacity.
  //
  // Returns {tree, nullptr} if any of the above conditions are not met.
  // This method effectively removes data from the tree. The intent of this
  // method is to allow applications appending small string data to use
  // pre-existing capacity, and add the modified rep back to the tree.
  //
  // Simplified, such code would look similar to this:
  //   void MyTreeBuilder::Append(string_view data) {
  //     ExtractResult result = CordRepBtree::ExtractAppendBuffer(tree_, 1);
  //     if (CordRep* rep = result.extracted) {
  //       size_t available = rep->Capacity() - rep->length;
  //       size_t n = std::min(data.size(), available);
  //       memcpy(rep->Data() + rep->length, data.data(), n);
  //       rep->length += n;
  //       data.remove_prefix(n);
  //       if (!result.tree->IsBtree()) {
  //         tree_ = CordRepBtree::Create(result.tree);
  //       }
  //       tree_ = CordRepBtree::Append(tree_, rep);
  //     }
  //     ...
  //     // Remaining edge in `result.tree`.
  //   }
  static ExtractResult ExtractAppendBuffer(CordRepBtree* tree,
                                           size_t extra_capacity = 1);

  // Returns the `height` of the tree. The height of a tree is limited to
  // kMaxHeight. `height` is implemented as an `int` as in some places we
  // use negative (-1) values for 'data edges'.
  int height() const { return static_cast<int>(storage[0]); }

  // Properties: begin, back, end, front/back boundary indexes.
  size_t begin() const { return static_cast<size_t>(storage[1]); }
  size_t back() const { return static_cast<size_t>(storage[2]) - 1; }
  size_t end() const { return static_cast<size_t>(storage[2]); }
  size_t index(EdgeType edge) const {
    return edge == kFront ? begin() : back();
  }

  // Properties: size and capacity.
  // `capacity` contains the current capacity of this instance, where
  // `kMaxCapacity` contains the maximum capacity of a btree node.
  // For now, `capacity` and `kMaxCapacity` return the same value, but this may
  // change in the future if we see benefit in dynamically sizing 'small' nodes
  // to 'large' nodes for large data trees.
  size_t size() const { return end() - begin(); }
  size_t capacity() const { return kMaxCapacity; }

  // Edge access
  inline CordRep* Edge(size_t index) const;
  inline CordRep* Edge(EdgeType edge_type) const;
  inline absl::Span<CordRep* const> Edges() const;
  inline absl::Span<CordRep* const> Edges(size_t begin, size_t end) const;
  // Returns a reference to the data edge at `index`.
  // Requires this instance to be a leaf node, and `index` to be a valid index.
  inline absl::string_view Data(size_t index) const;

  // Diagnostics: returns true if `tree` is valid and internally consistent.
  // If `shallow` is false, then the provided top level node and all child
  // nodes below it are recursively checked. If `shallow` is true, only the
  // provided node in `tree` and the cumulative length, type and height of the
  // direct child nodes of `tree` are checked. The value of `shallow` is
  // ignored if the internal `cord_btree_exhaustive_validation` diagnostics
  // variable is true, in which case the performed validation works as if
  // `shallow` were false.
  // This function is intended for debugging and testing purposes only.
  static bool IsValid(const CordRepBtree* tree, bool shallow = false);

  // Diagnostics: asserts that the provided tree is valid.
  // `AssertValid()` performs a shallow validation by default. `shallow` can be
  // set to false in which case an exhaustive validation is performed. This
  // function is implemented in terms of calling `IsValid()` and asserting the
  // return value to be true. See `IsValid()` for more information.
  // This function is intended for debugging and testing purposes only.
  static CordRepBtree* AssertValid(CordRepBtree* tree, bool shallow = true);
  static const CordRepBtree* AssertValid(const CordRepBtree* tree,
                                         bool shallow = true);

  // Diagnostics: dump the contents of this tree to `stream`.
  // This function is intended for debugging and testing purposes only.
  static void Dump(const CordRep* rep, std::ostream& stream);
  static void Dump(const CordRep* rep, absl::string_view label,
                   std::ostream& stream);
  static void Dump(const CordRep* rep, absl::string_view label,
                   bool include_contents, std::ostream& stream);

  // Adds the edge `edge` to this node if possible. `owned` indicates if the
  // current node is potentially shared or not with other threads. Returns:
  // - {kSelf, <this>}
  //   The edge was directly added to this node.
  // - {kCopied, <node>}
  //   The edge was added to a copy of this node.
  // - {kPopped, New(edge, height())}
  //   A new leg with the edge was created as this node has no extra capacity.
  template <EdgeType edge_type>
  inline OpResult AddEdge(bool owned, CordRep* edge, size_t delta);

  // Replaces the front or back edge with the provided new edge. Returns:
  // - {kSelf, <this>}
  //   The edge was directly set in this node. The old edge is unreffed.
  // - {kCopied, <node>}
  //   A copy of this node was created with the new edge value.
  // In both cases, the function adopts a reference on `edge`.
  template <EdgeType edge_type>
  OpResult SetEdge(bool owned, CordRep* edge, size_t delta);

  // Creates a new empty node at the specified height.
  static CordRepBtree* New(int height = 0);

  // Creates a new node containing `rep`, with the height being computed
  // automatically based on the type of `rep`.
  static CordRepBtree* New(CordRep* rep);

  // Creates a new node containing both `front` and `back` at height
  // `front.height() + 1`. Requires `back.height() == front.height()`.
  static CordRepBtree* New(CordRepBtree* front, CordRepBtree* back);

  // Creates a fully balanced tree from the provided tree by rebuilding a new
  // tree from all data edges in the input. This function is automatically
  // invoked internally when the tree exceeds the maximum height.
  static CordRepBtree* Rebuild(CordRepBtree* tree);

 private:
  CordRepBtree() = default;
  ~CordRepBtree() = default;

  // Initializes the main properties `tag`, `begin`, `end`, `height`.
  inline void InitInstance(int height, size_t begin = 0, size_t end = 0);

  // Direct property access begin / end
  void set_begin(size_t begin) { storage[1] = static_cast<uint8_t>(begin); }
  void set_end(size_t end) { storage[2] = static_cast<uint8_t>(end); }

  // Decreases the value of `begin` by `n`, and returns the new value. Notice
  // how this returns the new value unlike atomic::fetch_add which returns the
  // old value. This is because this is used to prepend edges at 'begin - 1'.
  size_t sub_fetch_begin(size_t n) {
    storage[1] -= static_cast<uint8_t>(n);
    return storage[1];
  }

  // Increases the value of `end` by `n`, and returns the previous value. This
  // function is typically used to append edges at 'end'.
  size_t fetch_add_end(size_t n) {
    const uint8_t current = storage[2];
    storage[2] = static_cast<uint8_t>(current + n);
    return current;
  }

  // Returns the index of the last edge starting on, or before `offset`, with
  // `n` containing the relative offset of `offset` inside that edge.
  // Requires `offset` < length.
  Position IndexOf(size_t offset) const;

  // Returns the index of the last edge starting before `offset`, with `n`
  // containing the relative offset of `offset` inside that edge.
  // This function is useful to find the edges for some span of bytes ending at
  // `offset` (i.e., `n` bytes). For example:
  //
  //   Position pos = IndexBefore(n)
  //   edges = Edges(begin(), pos.index)      // All full edges (may be empty)
  //   last = Sub(Edge(pos.index), 0, pos.n)  // Last partial edge (may be empty)
  //
  // Requires 0 < `offset` <= length.
  Position IndexBefore(size_t offset) const;

  // Returns the index of the edge ending at (or on) length `n`, and the
  // number of bytes inside that edge up to `n`. For example, if we have a
  // node with 2 edges, one 10 and one 20 bytes long, then IndexOfLength(27)
  // will return {1, 17}, and IndexOfLength(10) will return {0, 10}.
  Position IndexOfLength(size_t n) const;

  // Identical to the above function except starting from the position `front`.
  // This function is equivalent to `IndexBefore(front.n + offset)`, with
  // the difference that this function is optimized to start at `front.index`.
  Position IndexBefore(Position front, size_t offset) const;

  // Returns the index of the edge directly beyond the edge containing offset
  // `offset`, with `n` containing the distance of that edge from `offset`.
  // This function is useful for iteratively finding suffix nodes and remaining
  // partial bytes in left-most suffix nodes as for example in CopySuffix.
  // Requires `offset` < length.
  Position IndexBeyond(size_t offset) const;
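
  // For example, for a node with two edges of lengths 10 and 20 (and
  // begin() == 0), IndexBeyond(15) returns {2, 15}: the edge beyond the one
  // containing offset 15 is edge 2, which starts at offset 30, i.e., 15 bytes
  // beyond offset 15. (Illustrative values only.)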

  // Creates a new leaf node containing as much data as possible from `data`.
  // The data is added either forwards or reversed depending on `edge_type`.
  // Callers must check the length of the returned node to determine if all
  // data was copied or not.
  // See the `Append/Prepend` function for the meaning and purpose of `extra`.
  template <EdgeType edge_type>
  static CordRepBtree* NewLeaf(absl::string_view data, size_t extra);

  // Creates a raw copy of this Btree node with the specified length, copying
  // all properties, but without adding any references to existing edges.
  CordRepBtree* CopyRaw(size_t new_length) const;

  // Creates a full copy of this Btree node, adding a reference on all edges.
  CordRepBtree* Copy() const;

  // Creates a partial copy of this Btree node, copying all edges up to `end`,
  // adding a reference on each copied edge, and sets the length of the newly
  // created copy to `new_length`.
  CordRepBtree* CopyBeginTo(size_t end, size_t new_length) const;

  // Returns a tree containing the edges [tree->begin(), end) and length
  // of `new_length`. This method consumes a reference on the provided
  // tree, and logically performs the following operation:
  //   result = tree->CopyBeginTo(end, new_length);
  //   CordRep::Unref(tree);
  //   return result;
  static CordRepBtree* ConsumeBeginTo(CordRepBtree* tree, size_t end,
                                      size_t new_length);

  // Creates a partial copy of this Btree node, copying all edges starting at
  // `begin`, adding a reference on each copied edge, and sets the length of
  // the newly created copy to `new_length`.
  CordRepBtree* CopyToEndFrom(size_t begin, size_t new_length) const;

  // Extracts and returns the front edge from the provided tree.
  // This method consumes a reference on the provided tree, and logically
  // performs the following operation:
  //   edge = CordRep::Ref(tree->Edge(kFront));
  //   CordRep::Unref(tree);
  //   return edge;
  static CordRep* ExtractFront(CordRepBtree* tree);

  // Returns a tree containing the result of appending `right` to `left`.
  static CordRepBtree* MergeTrees(CordRepBtree* left, CordRepBtree* right);

  // Fallback functions for `Create()`, `Append()` and `Prepend()` which
  // deal with legacy / non conforming input, i.e.: CONCAT trees.
  static CordRepBtree* CreateSlow(CordRep* rep);
  static CordRepBtree* AppendSlow(CordRepBtree*, CordRep* rep);
  static CordRepBtree* PrependSlow(CordRepBtree*, CordRep* rep);

  // Recursively rebuilds `tree` into `stack`. If `consume` is set to true, the
  // function will consume a reference on `tree`. `stack` is a null terminated
  // array containing the new tree's state, with the current leaf node at
  // stack[0], and parent nodes above that, or null for 'top of tree'.
  static void Rebuild(CordRepBtree** stack, CordRepBtree* tree, bool consume);

  // Aligns existing edges to start at index 0, to allow for a new edge to be
  // added to the back of the current edges.
  inline void AlignBegin();

  // Aligns existing edges to end at `capacity`, to allow for a new edge to be
  // added in front of the current edges.
  inline void AlignEnd();

  // Adds the provided edge to this node.
  // Requires this node to have capacity for the edge. Realigns / moves
  // existing edges as needed to prepend or append the new edge.
  template <EdgeType edge_type>
  inline void Add(CordRep* rep);

  // Adds the provided edges to this node.
  // Requires this node to have capacity for the edges. Realigns / moves
  // existing edges as needed to prepend or append the new edges.
  template <EdgeType edge_type>
  inline void Add(absl::Span<CordRep* const>);

  // Adds data from `data` to this node until either all data has been
  // consumed, or there is no more capacity for additional flat nodes inside
  // this node.
  // Requires the current node to be a leaf node, data to be non empty, and the
  // current node to have capacity for at least one more data edge.
  // Returns any remaining data from `data` that was not added, which is,
  // depending on the edge type (front / back), either the remaining prefix or
  // suffix of the input.
  // See the `Append/Prepend` function for the meaning and purpose of `extra`.
  template <EdgeType edge_type>
  absl::string_view AddData(absl::string_view data, size_t extra);

  // Replaces the front or back edge with the provided value.
  // Adopts a reference on `edge` and unrefs the old edge.
  template <EdgeType edge_type>
  inline void SetEdge(CordRep* edge);

  // Returns a partial copy of the current tree containing the first `n` bytes
  // of data. `CopyResult` contains both the resulting edge and its height. The
  // resulting tree may be less high than the current tree, or even be a single
  // matching data edge if `allow_folding` is set to true.
  // For example, if `n == 1`, then the result will be the single data edge,
  // and height will be set to -1 (one below the owning leaf node). If n == 0,
  // this function returns null. Requires `n <= length`.
  CopyResult CopyPrefix(size_t n, bool allow_folding = true);

  // Returns a partial copy of the current tree containing all data starting
  // after `offset`. `CopyResult` contains both the resulting edge and its
  // height. The resulting tree may be less high than the current tree, or even
  // be a single matching data edge. For example, if `n == length - 1`, then
  // the result will be a single data edge, and height will be set to -1 (one
  // below the owning leaf node).
  // Requires `offset < length`.
  CopyResult CopySuffix(size_t offset);

  // Returns an OpResult value of {this, kSelf} or {Copy(), kCopied},
  // depending on the value of `owned`.
  inline OpResult ToOpResult(bool owned);

  // Adds `rep` to the specified tree, returning the modified tree.
  template <EdgeType edge_type>
  static CordRepBtree* AddCordRep(CordRepBtree* tree, CordRep* rep);

  // Adds `data` to the specified tree, returning the modified tree.
  // See the `Append/Prepend` function for the meaning and purpose of `extra`.
  template <EdgeType edge_type>
  static CordRepBtree* AddData(CordRepBtree* tree, absl::string_view data,
                               size_t extra = 0);

  // Merges `src` into `dst` with `src` being added either before (kFront) or
  // after (kBack) `dst`. Requires the height of `dst` to be greater than or
  // equal to the height of `src`.
  template <EdgeType edge_type>
  static CordRepBtree* Merge(CordRepBtree* dst, CordRepBtree* src);

  // Fallback version of GetAppendBuffer for large trees: GetAppendBuffer()
  // implements an inlined version for trees of limited height (3 levels),
  // GetAppendBufferSlow implements the logic for large trees.
  Span<char> GetAppendBufferSlow(size_t size);

  // `edges_` contains all edges starting from this instance.
  // These are explicitly `child` edges only; a cord btree (or any cord tree,
  // for that matter) does not store `parent` pointers anywhere: multiple trees
  // / parents can reference the same shared child edge. The type of these
  // edges depends on the height of the node. `Leaf nodes` (height == 0)
  // contain `data edges` (external or flat nodes, or sub-strings thereof). All
  // other nodes (height > 0) contain pointers to BTREE nodes with a height of
  // `height - 1`.
  CordRep* edges_[kMaxCapacity];

  friend class CordRepBtreeTestPeer;
  friend class CordRepBtreeNavigator;
};

inline CordRepBtree* CordRep::btree() {
  assert(IsBtree());
  return static_cast<CordRepBtree*>(this);
}

inline const CordRepBtree* CordRep::btree() const {
  assert(IsBtree());
  return static_cast<const CordRepBtree*>(this);
}

inline void CordRepBtree::InitInstance(int height, size_t begin, size_t end) {
  tag = BTREE;
  storage[0] = static_cast<uint8_t>(height);
  storage[1] = static_cast<uint8_t>(begin);
  storage[2] = static_cast<uint8_t>(end);
}

inline CordRep* CordRepBtree::Edge(size_t index) const {
  assert(index >= begin());
  assert(index < end());
  return edges_[index];
}

inline CordRep* CordRepBtree::Edge(EdgeType edge_type) const {
  return edges_[edge_type == kFront ? begin() : back()];
}

inline absl::Span<CordRep* const> CordRepBtree::Edges() const {
  return {edges_ + begin(), size()};
}

inline absl::Span<CordRep* const> CordRepBtree::Edges(size_t begin,
                                                      size_t end) const {
  assert(begin <= end);
  assert(begin >= this->begin());
  assert(end <= this->end());
  return {edges_ + begin, static_cast<size_t>(end - begin)};
}

inline absl::string_view CordRepBtree::Data(size_t index) const {
  assert(height() == 0);
  return EdgeData(Edge(index));
}

inline CordRepBtree* CordRepBtree::New(int height) {
  CordRepBtree* tree = new CordRepBtree;
  tree->length = 0;
  tree->InitInstance(height);
  return tree;
}

inline CordRepBtree* CordRepBtree::New(CordRep* rep) {
  CordRepBtree* tree = new CordRepBtree;
  int height = rep->IsBtree() ? rep->btree()->height() + 1 : 0;
  tree->length = rep->length;
  tree->InitInstance(height, /*begin=*/0, /*end=*/1);
  tree->edges_[0] = rep;
  return tree;
}

inline CordRepBtree* CordRepBtree::New(CordRepBtree* front,
                                       CordRepBtree* back) {
  assert(front->height() == back->height());
  CordRepBtree* tree = new CordRepBtree;
  tree->length = front->length + back->length;
  tree->InitInstance(front->height() + 1, /*begin=*/0, /*end=*/2);
  tree->edges_[0] = front;
  tree->edges_[1] = back;
  return tree;
}

inline void CordRepBtree::Unref(absl::Span<CordRep* const> edges) {
  for (CordRep* edge : edges) {
    if (ABSL_PREDICT_FALSE(!edge->refcount.Decrement())) {
      CordRep::Destroy(edge);
    }
  }
}

inline CordRepBtree* CordRepBtree::CopyRaw(size_t new_length) const {
  CordRepBtree* tree = new CordRepBtree;

  // `length` and `refcount` are the first members of `CordRepBtree`.
  // We initialize `length` using the given length, have `refcount` be set to
  // ref = 1 through its default constructor, and copy all data beyond
  // `refcount`, which starts with `tag`, using a single memcpy: all contents
  // except `refcount` are trivially copyable, and the compiler does not
  // efficiently coalesce member-wise copy of these members.
  // See https://gcc.godbolt.org/z/qY8zsca6z
  // # LINT.IfChange(copy_raw)
  tree->length = new_length;
  uint8_t* dst = &tree->tag;
  const uint8_t* src = &tag;
  const ptrdiff_t offset = src - reinterpret_cast<const uint8_t*>(this);
  memcpy(dst, src, sizeof(CordRepBtree) - static_cast<size_t>(offset));
  return tree;
  // # LINT.ThenChange()
}

inline CordRepBtree* CordRepBtree::Copy() const {
  CordRepBtree* tree = CopyRaw(length);
  for (CordRep* rep : Edges()) CordRep::Ref(rep);
  return tree;
}

inline CordRepBtree* CordRepBtree::CopyToEndFrom(size_t begin,
                                                 size_t new_length) const {
  assert(begin >= this->begin());
  assert(begin <= this->end());
  CordRepBtree* tree = CopyRaw(new_length);
  tree->set_begin(begin);
  for (CordRep* edge : tree->Edges()) CordRep::Ref(edge);
  return tree;
}

inline CordRepBtree* CordRepBtree::CopyBeginTo(size_t end,
                                               size_t new_length) const {
  assert(end <= capacity());
  assert(end >= this->begin());
  CordRepBtree* tree = CopyRaw(new_length);
  tree->set_end(end);
  for (CordRep* edge : tree->Edges()) CordRep::Ref(edge);
  return tree;
}

inline void CordRepBtree::AlignBegin() {
  // The below code itself does not need to be fast as typically we have
  // mono-directional append/prepend calls, and `begin` / `end` are typically
  // adjusted no more than once. But we want to avoid potential register
  // clobber effects, making the compiler emit register save/store/spills, and
  // minimize the size of code.
  const size_t delta = begin();
  if (ABSL_PREDICT_FALSE(delta != 0)) {
    const size_t new_end = end() - delta;
    set_begin(0);
    set_end(new_end);
    // TODO(mvels): we can write this using 2 loads / 2 stores depending on
    // total size for the kMaxCapacity = 6 case. I.e., we can branch (switch)
    // on size, and then do overlapping load/store of up to 4 pointers (inlined
    // as XMM, YMM or ZMM load/store) and up to 2 pointers (XMM / YMM), which
    // is a) compact and b) not clobbering any registers.
    ABSL_ASSUME(new_end <= kMaxCapacity);
#ifdef __clang__
#pragma unroll 1
#endif
    for (size_t i = 0; i < new_end; ++i) {
      edges_[i] = edges_[i + delta];
    }
  }
}

inline void CordRepBtree::AlignEnd() {
  // See comments in `AlignBegin` for motivation on the hand-rolled for loops.
  const size_t delta = capacity() - end();
  if (delta != 0) {
    const size_t new_begin = begin() + delta;
    const size_t new_end = end() + delta;
    set_begin(new_begin);
    set_end(new_end);
    ABSL_ASSUME(new_end <= kMaxCapacity);
#ifdef __clang__
#pragma unroll 1
#endif
    for (size_t i = new_end - 1; i >= new_begin; --i) {
      edges_[i] = edges_[i - delta];
    }
  }
}

template <>
inline void CordRepBtree::Add<CordRepBtree::kBack>(CordRep* rep) {
  AlignBegin();
  edges_[fetch_add_end(1)] = rep;
}

template <>
inline void CordRepBtree::Add<CordRepBtree::kBack>(
    absl::Span<CordRep* const> edges) {
  AlignBegin();
  size_t new_end = end();
  for (CordRep* edge : edges) edges_[new_end++] = edge;
  set_end(new_end);
}

template <>
inline void CordRepBtree::Add<CordRepBtree::kFront>(CordRep* rep) {
  AlignEnd();
  edges_[sub_fetch_begin(1)] = rep;
}

template <>
inline void CordRepBtree::Add<CordRepBtree::kFront>(
    absl::Span<CordRep* const> edges) {
  AlignEnd();
  size_t new_begin = begin() - edges.size();
  set_begin(new_begin);
  for (CordRep* edge : edges) edges_[new_begin++] = edge;
}

template <CordRepBtree::EdgeType edge_type>
inline void CordRepBtree::SetEdge(CordRep* edge) {
  const int idx = edge_type == kFront ? begin() : back();
  CordRep::Unref(edges_[idx]);
  edges_[idx] = edge;
}

inline CordRepBtree::OpResult CordRepBtree::ToOpResult(bool owned) {
  return owned ? OpResult{this, kSelf} : OpResult{Copy(), kCopied};
}

inline CordRepBtree::Position CordRepBtree::IndexOf(size_t offset) const {
  assert(offset < length);
  size_t index = begin();
  while (offset >= edges_[index]->length) offset -= edges_[index++]->length;
  return {index, offset};
}

inline CordRepBtree::Position CordRepBtree::IndexBefore(size_t offset) const {
  assert(offset > 0);
  assert(offset <= length);
  size_t index = begin();
  while (offset > edges_[index]->length) offset -= edges_[index++]->length;
  return {index, offset};
}

inline CordRepBtree::Position CordRepBtree::IndexBefore(Position front,
                                                        size_t offset) const {
  size_t index = front.index;
  offset = offset + front.n;
  while (offset > edges_[index]->length) offset -= edges_[index++]->length;
  return {index, offset};
}

inline CordRepBtree::Position CordRepBtree::IndexOfLength(size_t n) const {
  assert(n <= length);
  size_t index = back();
  size_t strip = length - n;
  while (strip >= edges_[index]->length) strip -= edges_[index--]->length;
  return {index, edges_[index]->length - strip};
}

inline CordRepBtree::Position CordRepBtree::IndexBeyond(
    const size_t offset) const {
  // We need to find the first edge whose starting offset is at or beyond (>=)
  // `offset`. For this we can't use the `offset -= length` logic of IndexOf.
  // Instead, we track the offset of the `current edge` in `off`, which we
  // increase as we iterate over the edges until we find the matching edge.
  size_t off = 0;
  size_t index = begin();
  while (offset > off) off += edges_[index++]->length;
  return {index, off - offset};
}

inline CordRepBtree* CordRepBtree::Create(CordRep* rep) {
  if (IsDataEdge(rep)) return New(rep);
  return CreateSlow(rep);
}

inline Span<char> CordRepBtree::GetAppendBuffer(size_t size) {
  assert(refcount.IsOne());
  CordRepBtree* tree = this;
  const int height = this->height();
  CordRepBtree* n1 = tree;
  CordRepBtree* n2 = tree;
  CordRepBtree* n3 = tree;
  switch (height) {
    case 3:
      tree = tree->Edge(kBack)->btree();
      if (!tree->refcount.IsOne()) return {};
      n2 = tree;
      ABSL_FALLTHROUGH_INTENDED;
    case 2:
      tree = tree->Edge(kBack)->btree();
      if (!tree->refcount.IsOne()) return {};
      n1 = tree;
      ABSL_FALLTHROUGH_INTENDED;
    case 1:
      tree = tree->Edge(kBack)->btree();
      if (!tree->refcount.IsOne()) return {};
      ABSL_FALLTHROUGH_INTENDED;
    case 0:
      CordRep* edge = tree->Edge(kBack);
      if (!edge->refcount.IsOne()) return {};
      if (edge->tag < FLAT) return {};
      size_t avail = edge->flat()->Capacity() - edge->length;
      if (avail == 0) return {};
      size_t delta = (std::min)(size, avail);
      Span<char> span = {edge->flat()->Data() + edge->length, delta};
      edge->length += delta;
      switch (height) {
        case 3:
          n3->length += delta;
          ABSL_FALLTHROUGH_INTENDED;
        case 2:
          n2->length += delta;
          ABSL_FALLTHROUGH_INTENDED;
        case 1:
          n1->length += delta;
          ABSL_FALLTHROUGH_INTENDED;
        case 0:
          tree->length += delta;
          return span;
      }
      break;
  }
  return GetAppendBufferSlow(size);
}

extern template CordRepBtree* CordRepBtree::AddCordRep<CordRepBtree::kBack>(
    CordRepBtree* tree, CordRep* rep);

extern template CordRepBtree* CordRepBtree::AddCordRep<CordRepBtree::kFront>(
    CordRepBtree* tree, CordRep* rep);

inline CordRepBtree* CordRepBtree::Append(CordRepBtree* tree, CordRep* rep) {
  if (ABSL_PREDICT_TRUE(IsDataEdge(rep))) {
    return CordRepBtree::AddCordRep<kBack>(tree, rep);
  }
  return AppendSlow(tree, rep);
}

inline CordRepBtree* CordRepBtree::Prepend(CordRepBtree* tree, CordRep* rep) {
  if (ABSL_PREDICT_TRUE(IsDataEdge(rep))) {
    return CordRepBtree::AddCordRep<kFront>(tree, rep);
  }
  return PrependSlow(tree, rep);
}

#ifdef NDEBUG

inline CordRepBtree* CordRepBtree::AssertValid(CordRepBtree* tree,
                                               bool /* shallow */) {
  return tree;
}

inline const CordRepBtree* CordRepBtree::AssertValid(const CordRepBtree* tree,
                                                     bool /* shallow */) {
  return tree;
}

#endif

}  // namespace cord_internal
ABSL_NAMESPACE_END
}  // namespace absl

#endif  // ABSL_STRINGS_INTERNAL_CORD_REP_BTREE_H_