xref: /aosp_15_r20/external/tensorflow/tensorflow/core/profiler/convert/op_profile_builder.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/convert/op_profile_builder.h"
17 
18 #include <algorithm>
19 #include <cstdint>
20 #include <memory>
21 #include <string>
22 #include <vector>
23 
24 #include "absl/container/flat_hash_map.h"
25 #include "absl/container/node_hash_map.h"
26 #include "absl/strings/ascii.h"
27 #include "absl/strings/str_cat.h"
28 #include "tensorflow/core/lib/gtl/top_n.h"
29 #include "tensorflow/core/platform/logging.h"
30 #include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h"
31 #include "tensorflow/core/profiler/convert/op_metrics_to_record.h"
32 #include "tensorflow/core/profiler/convert/xla_op_utils.h"
33 #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
34 #include "tensorflow/core/profiler/protobuf/op_profile.pb.h"
35 #include "tensorflow/core/profiler/utils/math_utils.h"
36 #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h"
37 
38 namespace tensorflow {
39 namespace profiler {
40 namespace {
41 
42 using op_profile::Metrics;
43 using op_profile::Node;
44 
45 // Fill symbol details into a node.
PopulateSymbolNode(const OpMetrics & op_metrics,Node * node)46 void PopulateSymbolNode(const OpMetrics& op_metrics, Node* node) {
47   node->set_name(op_metrics.name());
48   Node::XLAInstruction& xla = *node->mutable_xla();
49   xla.set_expression(op_metrics.long_name());
50   xla.set_category(op_metrics.category());
51   xla.set_provenance(op_metrics.provenance());
52   if (op_metrics.has_layout()) {
53     for (const auto& dimension : op_metrics.layout().dimensions()) {
54       auto* dim = xla.mutable_layout()->add_dimensions();
55       dim->set_size(dimension.size());
56       dim->set_alignment(dimension.alignment());
57       dim->set_semantics(absl::AsciiStrToLower(
58           LayoutDimensionSemantics_Name(dimension.semantics())));
59     }
60   }
61   xla.set_computation_primitive_size(op_metrics.computation_primitive_size());
62 }
63 
64 // Sort the children and only keep the top K children.
65 template <typename Cmp>
TopKChildren(const Node * root,int k,Cmp cmp)66 Node TopKChildren(const Node* root, int k, Cmp cmp) {
67   tensorflow::gtl::TopN<const Node*, decltype(cmp)> top_n(k, cmp);
68   for (const Node& node : root->children()) {
69     top_n.push(&node);
70   }
71   Node output;
72   std::unique_ptr<std::vector<const Node*>> extracted_nodes(top_n.Extract());
73   for (const Node* node : *extracted_nodes) {
74     *output.add_children() = *node;
75   }
76   return output;
77 }
78 
79 // Copy symbol details into a deduplicated node from the top child node.
CopySymbolDetailsToDeduplicatedNode(Node * top_child_node,Node * deduplicated_node)80 void CopySymbolDetailsToDeduplicatedNode(Node* top_child_node,
81                                          Node* deduplicated_node) {
82   deduplicated_node->set_name(
83       absl::StrCat(top_child_node->name(), " and its duplicate(s)"));
84   Node::XLAInstruction& xla = *deduplicated_node->mutable_xla();
85   const Node::XLAInstruction& top_child_node_xla = top_child_node->xla();
86   xla.set_expression(top_child_node_xla.expression());
87   xla.set_category(top_child_node_xla.category());
88   if (IsFusion(top_child_node_xla.category())) return;
89   xla.set_provenance(top_child_node_xla.provenance());
90   *xla.mutable_layout() = top_child_node_xla.layout();
91 }
92 
SortAndPruneChildren(int k,int level,Node * root)93 void SortAndPruneChildren(int k, int level, Node* root) {
94   // Set the total number of children before pruning.
95   root->set_num_children(root->children_size());
96   for (Node& node : *root->mutable_children()) {
97     SortAndPruneChildren(k, level - 1, &node);
98   }
99   k = level > 0 ? root->children_size() : k;
100 
101   if (root->children_size() > 1) {
102     if (root->has_xla() && IsFusion(root->xla().category())) {
103       // Sort the children under fusion node by raw flops.
104       *root->mutable_children() =
105           TopKChildren(root, k, [](const Node* a, const Node* b) {
106             return a->metrics().raw_flops() > b->metrics().raw_flops();
107           }).children();
108     } else {
109       *root->mutable_children() =
110           TopKChildren(root, k, [](const Node* a, const Node* b) {
111             return a->metrics().time() > b->metrics().time();
112           }).children();
113     }
114   }
115 }
116 
117 // Finalize deduplicated nodes by copying symbol details from the top child
118 // node.
FinalizeDeduplicatedNodes(bool by_program,Node * root)119 void FinalizeDeduplicatedNodes(bool by_program, Node* root) {
120   if (by_program) {
121     for (Node& program_node : *root->mutable_children()) {
122       for (Node& category_node : *program_node.mutable_children()) {
123         for (Node& deduplicated_node : *category_node.mutable_children()) {
124           // Skip for non deduplicated nodes. Those nodes already have name set.
125           if (!deduplicated_node.name().empty() ||
126               deduplicated_node.children().empty())
127             continue;
128           CopySymbolDetailsToDeduplicatedNode(
129               deduplicated_node.mutable_children(0), &deduplicated_node);
130         }
131       }
132     }
133   } else {
134     for (Node& category_node : *root->mutable_children()) {
135       for (Node& deduplicated_node : *category_node.mutable_children()) {
136         // Skip for non deduplicated nodes. Those nodes already have name set.
137         if (!deduplicated_node.name().empty() ||
138             deduplicated_node.children().empty())
139           continue;
140         CopySymbolDetailsToDeduplicatedNode(
141             deduplicated_node.mutable_children(0), &deduplicated_node);
142       }
143     }
144   }
145 }
146 
147 // Fills op metrics into a node.
PopulateOpMetricsNode(const OpMetrics & op_metrics,double peak_gigaflops_per_second_per_core,double peak_gibibytes_per_second_per_core,uint64_t total_time_ps,Node * node)148 void PopulateOpMetricsNode(const OpMetrics& op_metrics,
149                            double peak_gigaflops_per_second_per_core,
150                            double peak_gibibytes_per_second_per_core,
151                            uint64_t total_time_ps, Node* node) {
152   DCHECK_EQ(ChildrenTimePs(op_metrics), 0);
153 
154   Metrics* metrics = node->mutable_metrics();
155   // The UI computes flops_rate = raw_flops / raw_time
156   // and memory_bandwidth = raw_bytes_accessed / raw_time. See:
157   // https://github.com/tensorflow/profiler/blob/master/frontend/app/common/utils/utils.ts
158   metrics->set_raw_time(op_metrics.time_ps());
159   metrics->set_raw_flops(op_metrics.flops());
160   metrics->set_raw_bytes_accessed(op_metrics.bytes_accessed());
161 
162   // "time" is the op or category fraction of total time.
163   metrics->set_time(SafeDivide(op_metrics.time_ps(), total_time_ps));
164 
165   double flops_utilization = SafeDivide(GigaFlopsPerSecondPerCore(op_metrics),
166                                         peak_gigaflops_per_second_per_core);
167   // The UI expects flops_utilization = flops / time. See:
168   // https://github.com/tensorflow/profiler/blob/master/frontend/app/common/utils/utils.ts
169   metrics->set_flops(flops_utilization * metrics->time());
170 
171   // TODO(b/219984562): Use hierarchical roofline.
172   double mem_bw_utilization = SafeDivide(GibiBytesPerSecondPerCore(op_metrics),
173                                          peak_gibibytes_per_second_per_core);
174   metrics->set_memory_bandwidth(mem_bw_utilization);
175 }
176 
177 // Sets the total time on the root node metrics.
SetTotalTime(uint64_t total_time_ps,Node * root)178 void SetTotalTime(uint64_t total_time_ps, Node* root) {
179   Metrics* metrics = root->mutable_metrics();
180   metrics->set_raw_time(total_time_ps);
181   metrics->set_time(1.0);
182 }
183 
184 // Recursively insert "fused instruction" nodes (with raw flops).
InsertFusedInstructions(const OpMetrics & op_metrics,Node * node)185 void InsertFusedInstructions(const OpMetrics& op_metrics, Node* node) {
186   if (!op_metrics.has_children()) return;
187   for (const auto& child : op_metrics.children().metrics_db()) {
188     Node* new_node = node->add_children();
189     PopulateSymbolNode(child, new_node);
190     new_node->mutable_metrics()->set_raw_flops(child.flops());
191     if (child.has_children()) {
192       InsertFusedInstructions(child, new_node);
193     }
194   }
195 }
196 
197 }  // namespace
198 
GenerateProgramName(uint64_t program_id) const199 std::string OpProfileBuilder::GenerateProgramName(uint64_t program_id) const {
200   DCHECK(program_name_map_ != nullptr);
201   auto iter = program_name_map_->find(program_id);
202   if (iter == program_name_map_->end()) return "main";
203   return HloModuleNameWithProgramId(iter->second, program_id);
204 }
205 
AddOpNode(const OpMetrics & op_metrics,Category * category,Node * deduplicated_node)206 Node* OpProfileBuilder::AddOpNode(const OpMetrics& op_metrics,
207                                   Category* category, Node* deduplicated_node) {
208   Node* leaf;
209   if (deduplicated_node != nullptr) {
210     leaf = deduplicated_node->add_children();
211   } else if (category != nullptr) {
212     leaf = category->node->add_children();
213   } else {
214     leaf = root_->add_children();
215   }
216   PopulateSymbolNode(op_metrics, leaf);
217   InsertFusedInstructions(op_metrics, leaf);
218   return leaf;
219 }
220 
LookupOrAddDeduplicatedNode(const OpMetrics & op_metrics,Category * category)221 Node* OpProfileBuilder::LookupOrAddDeduplicatedNode(const OpMetrics& op_metrics,
222                                                     Category* category) {
223   Node*& deduplicated_node =
224       category->deduplicated_nodes[op_metrics.deduplicated_name()];
225   if (deduplicated_node == nullptr) {
226     deduplicated_node = category->node->add_children();
227   }
228   return deduplicated_node;
229 }
230 
LookupOrAddCategoryNode(const OpMetrics & op_metrics,Program * program)231 OpProfileBuilder::Category* OpProfileBuilder::LookupOrAddCategoryNode(
232     const OpMetrics& op_metrics, Program* program) {
233   Category* category;
234   Node* category_parent;
235   if (program != nullptr) {
236     category = &program->categories[op_metrics.category()];
237     category_parent = program->node;
238   } else {
239     category = &category_map_[op_metrics.category()];
240     category_parent = root_;
241   }
242   if (category->node == nullptr) {
243     category->node = category_parent->add_children();
244     category->node->set_name(op_metrics.category());
245   }
246   return category;
247 }
248 
LookupOrAddProgramNode(const OpMetrics & op_metrics)249 OpProfileBuilder::Program* OpProfileBuilder::LookupOrAddProgramNode(
250     const OpMetrics& op_metrics) {
251   uint64_t program_id = op_metrics.hlo_module_id();
252   Program* program = &programs_map_[program_id];
253   if (program->node == nullptr) {
254     program->node = root_->add_children();
255     program->node->set_name(GenerateProgramName(program_id));
256   }
257   return program;
258 }
259 
AddOp(const OpMetrics & op_metrics)260 void OpProfileBuilder::AddOp(const OpMetrics& op_metrics) {
261   // Exclude ops with children ops to avoid double counting of flops, bytes and
262   // time from children ops.
263   if (ChildrenTimePs(op_metrics) > 0) return;
264 
265   // The path from the root to the leaf node:
266   // e.g. by_program -> cluster_xx -> convolution -> convolution.1 and its
267   // deduplicates -> convolution.1
268   // We will aggregate the metrics of convolution.1 to all its parent nodes.
269   std::vector<Node*> all_paths = {root_};
270 
271   if (IsIdleOp(op_metrics)) {
272     Node* leaf = AddOpNode(op_metrics);
273     all_paths.push_back(leaf);
274   } else {
275     Program* program = nullptr;
276     if (options_.group_by_program) {
277       program = LookupOrAddProgramNode(op_metrics);
278       all_paths.push_back(program->node);
279     }
280 
281     Category* category = LookupOrAddCategoryNode(op_metrics, program);
282     all_paths.push_back(category->node);
283 
284     Node* deduplicated_node = nullptr;
285     if (options_.group_by_deduplicated_name &&
286         !op_metrics.deduplicated_name().empty()) {
287       deduplicated_node = LookupOrAddDeduplicatedNode(op_metrics, category);
288       all_paths.push_back(deduplicated_node);
289     }
290 
291     Node* leaf = AddOpNode(op_metrics, category, deduplicated_node);
292     all_paths.push_back(leaf);
293   }
294 
295   for (auto* node : all_paths) {
296     // Per program combiner does not need to update OpMetrics.num_cores
297     CombineOpMetrics(op_metrics, &metrics_[node], /*update_num_cores=*/false);
298   }
299 }
300 
Finalize(double peak_gigaflops_per_second_per_core,double peak_gibibytes_per_second_per_core,uint64_t total_time_ps)301 void OpProfileBuilder::Finalize(double peak_gigaflops_per_second_per_core,
302                                 double peak_gibibytes_per_second_per_core,
303                                 uint64_t total_time_ps) {
304   for (const auto& [node, op_metrics] : metrics_) {
305     PopulateOpMetricsNode(op_metrics, peak_gigaflops_per_second_per_core,
306                           peak_gibibytes_per_second_per_core, total_time_ps,
307                           node);
308   }
309   SetTotalTime(total_time_ps, root_);
310   // If grouping by program, we build a two-level pruned tree: the first level
311   // is per program and the second level is per category. Otherwise we build a
312   // single-level per category pruned tree.
313   int level = options_.group_by_program ? 2 : 1;
314   SortAndPruneChildren(options_.children_per_node, level, root_);
315   if (options_.group_by_deduplicated_name) {
316     FinalizeDeduplicatedNodes(options_.group_by_program, root_);
317   }
318 }
319 
OpProfileBuilder(const OpProfileOptions & options,tensorflow::profiler::op_profile::Node * root,const tensorflow::protobuf::Map<uint64_t,std::string> * program_name_map)320 OpProfileBuilder::OpProfileBuilder(
321     const OpProfileOptions& options,
322     tensorflow::profiler::op_profile::Node* root,
323     const tensorflow::protobuf::Map<uint64_t, std::string>* program_name_map)
324     : options_(options), root_(root), program_name_map_(program_name_map) {
325   CHECK(root != nullptr);
326   DCHECK(!options_.group_by_program || program_name_map_ != nullptr);
327   root->set_name(options_.group_by_program ? "by_program" : "by_category");
328 }
329 }  // namespace profiler
330 }  // namespace tensorflow
331