xref: /aosp_15_r20/external/tensorflow/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/convert/op_stats_to_overview_page.h"
17 
18 #include <string>
19 
20 #include "google/protobuf/any.pb.h"
21 #include "absl/strings/str_cat.h"
22 #include "tensorflow/core/platform/types.h"
23 #include "tensorflow/core/profiler/convert/op_metrics_to_record.h"
24 #include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h"
25 #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
26 #include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h"
27 #include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
28 #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
29 #include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
30 #include "tensorflow/core/profiler/protobuf/overview_page.pb.h"
31 #include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
32 #include "tensorflow/core/profiler/protobuf/tf_function.pb.h"
33 #include "tensorflow/core/profiler/utils/diagnostics.h"
34 #include "tensorflow/core/profiler/utils/format_utils.h"
35 #include "tensorflow/core/profiler/utils/hardware_type_utils.h"
36 #include "tensorflow/core/profiler/utils/html_utils.h"
37 #include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
38 #include "tensorflow/core/profiler/utils/math_utils.h"
39 #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h"
40 #include "tensorflow/core/profiler/utils/tf_op_utils.h"
41 #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
42 #include "tensorflow/core/profiler/utils/xplane_schema.h"
43 #include "tensorflow/core/profiler/utils/xplane_utils.h"
44 
45 namespace tensorflow {
46 namespace profiler {
47 
48 namespace {
49 
// Threshold, in percent, on the share of device compute time spent in 16-bit
// ops. GeneratePrecisionStatement() emits a suggestion only when the measured
// share falls below this value.
constexpr double kLowPrecisionPercentThreshold = 10.0;
53 
54 struct TfFunctionInfo {
55   absl::string_view function_name;
56   double expensive_call_percent;
57 };
58 
MakeOverviewPageTip(std::string text)59 OverviewPageTip MakeOverviewPageTip(std::string text) {
60   OverviewPageTip tip;
61   tip.set_link(std::move(text));
62   return tip;
63 }
64 
65 // Makes a recommendation for looking up a document.
66 // doc_url is expected to be already be escaped suitably for use in an HTML
67 // attribute.
MakeOverviewPageTipDocLink(absl::string_view doc_url,absl::string_view text)68 OverviewPageTip MakeOverviewPageTipDocLink(absl::string_view doc_url,
69                                            absl::string_view text) {
70   return MakeOverviewPageTip(AnchorElement(doc_url, text));
71 }
72 
ComputeHostTips(OverviewPageRecommendation * re)73 void ComputeHostTips(OverviewPageRecommendation* re) {
74   *re->add_host_tips() = MakeOverviewPageTip(
75       "input_pipeline_analyzer (especially Section 3 for the breakdown of "
76       "input operations on the Host)");
77   *re->add_host_tips() = MakeOverviewPageTip(
78       "tf_data_bottleneck_analysis (find the bottleneck in the tf.data input "
79       "pipeline)");
80   *re->add_host_tips() = MakeOverviewPageTip(
81       "trace_viewer (look at the activities on the timeline of each Host "
82       "Thread near the bottom of the trace view)");
83 }
84 
ComputeDeviceTips(HardwareType hardware_type,OverviewPageRecommendation * re)85 void ComputeDeviceTips(HardwareType hardware_type,
86                        OverviewPageRecommendation* re) {
87   absl::string_view device_name = HardwareType_Name(hardware_type);
88   absl::string_view timeline_name = device_name;
89   absl::string_view op_stats_toolname = "tensorflow_stats";
90   if (hardware_type == tensorflow::profiler::TPU) {
91     timeline_name = "TPU core";
92     op_stats_toolname = "op_profile";
93   }
94   *re->add_device_tips() = MakeOverviewPageTip(
95       absl::StrCat(op_stats_toolname,
96                    " (identify the time-consuming operations "
97                    "executed on the ",
98                    device_name, ")"));
99   *re->add_device_tips() = MakeOverviewPageTip(absl::StrCat(
100       "trace_viewer (look at the activities on the timeline of each ",
101       timeline_name, " in the trace view)"));
102 }
103 
ComputeFaqTips(OverviewPageRecommendation * re)104 void ComputeFaqTips(OverviewPageRecommendation* re) {
105   *re->add_faq_tips() = MakeOverviewPageTip("Refer to the TF2 Profiler FAQ");
106 }
107 
ComputeDocumentationTips(OverviewPageRecommendation * re)108 void ComputeDocumentationTips(OverviewPageRecommendation* re) {
109   *re->add_documentation_tips() = MakeOverviewPageTipDocLink(
110       "https://www.tensorflow.org/guide/data_performance_analysis",
111       "Analyze tf.data performance with the TF Profiler");
112   *re->add_documentation_tips() = MakeOverviewPageTipDocLink(
113       "https://www.tensorflow.org/guide/"
114       "data_performance",
115       "Better performance with the tf.data API");
116 }
117 
GeneratePrecisionStatement(const PrecisionStats & precision_stats)118 std::string GeneratePrecisionStatement(const PrecisionStats& precision_stats) {
119   uint64 total_compute_ps =
120       precision_stats.compute_16bit_ps() + precision_stats.compute_32bit_ps();
121   if (total_compute_ps > 0) {
122     double percent_16bit =
123         (100.0 * precision_stats.compute_16bit_ps()) / total_compute_ps;
124     if (percent_16bit < kLowPrecisionPercentThreshold) {
125       return absl::StrCat(
126           "Only ", OneDigit(percent_16bit),
127           "% of device computation is 16 bit. So you might want to replace "
128           "more 32-bit Ops by 16-bit Ops to improve performance (if the "
129           "reduced accuracy is acceptable).");
130     }
131   }
132   return "";
133 }
134 
135 }  // namespace
136 
SetCommonRecommendation(absl::string_view input_classification,absl::string_view input_statement,absl::string_view output_statement,HardwareType hardware_type,absl::string_view tf_function_statement_html,absl::string_view eager_statement_html,absl::string_view outside_compilation_statement_html,OverviewPageRecommendation * re)137 void SetCommonRecommendation(
138     absl::string_view input_classification, absl::string_view input_statement,
139     absl::string_view output_statement, HardwareType hardware_type,
140     absl::string_view tf_function_statement_html,
141     absl::string_view eager_statement_html,
142     absl::string_view outside_compilation_statement_html,
143     OverviewPageRecommendation* re) {
144   re->set_bottleneck(std::string(input_classification));
145   re->set_statement(std::string(input_statement));
146   re->set_output_statement(std::string(output_statement));
147   re->set_tf_function_statement_html(std::string(tf_function_statement_html));
148   re->set_eager_statement_html(std::string(eager_statement_html));
149   re->set_outside_compilation_statement_html(
150       std::string(outside_compilation_statement_html));
151   ComputeHostTips(re);
152   ComputeDeviceTips(hardware_type, re);
153   ComputeDocumentationTips(re);
154   ComputeFaqTips(re);
155 }
156 
ComputeGenericRecommendation(const BottleneckAnalysis & bottleneck,const PrecisionStats & precision_stats)157 OverviewPageRecommendation ComputeGenericRecommendation(
158     const BottleneckAnalysis& bottleneck,
159     const PrecisionStats& precision_stats) {
160   OverviewPageRecommendation re;
161   GenericRecommendation generic;
162   generic.set_device_collectives_bottleneck(
163       bottleneck.device_collectives_classification());
164   generic.set_device_collectives_statement(
165       bottleneck.device_collectives_statement());
166   generic.set_kernel_launch_bottleneck(
167       bottleneck.kernel_launch_classification());
168   generic.set_kernel_launch_statement(bottleneck.kernel_launch_statement());
169   generic.set_all_other_bottleneck(bottleneck.all_other_classification());
170   generic.set_all_other_statement(bottleneck.all_other_statement());
171   generic.set_precision_statement(GeneratePrecisionStatement(precision_stats));
172   re.mutable_recommendation()->PackFrom(generic);
173   return re;
174 }
175 
ComputeAnalysisResult(const OpStats & op_stats)176 OverviewPageAnalysis ComputeAnalysisResult(const OpStats& op_stats) {
177   OverviewPageAnalysis analysis;
178   OpMetricsDb device_tf_op_metrics_db = CreateTfMetricsDbFromDeviceOpMetricsDb(
179       op_stats.device_op_metrics_db(), /*with_idle=*/false);
180   KernelStatsByOpName kernel_stats_by_op_name =
181       GroupKernelReportsByOpName(op_stats.kernel_stats_db());
182   uint64 total_device_time_ps = device_tf_op_metrics_db.total_time_ps();
183   constexpr int kNumTopOpsShown = 10;
184   double device_cumulative_fraction = 0.0;
185   for (const OpMetrics* metrics :
186        SortedOpMetricsDb(device_tf_op_metrics_db, kNumTopOpsShown)) {
187     OverviewTfOp* op = analysis.add_top_device_ops();
188     op->set_name(metrics->name());
189     op->set_category(metrics->category());
190     op->set_self_time_fraction(
191         SafeDivide(metrics->self_time_ps(), total_device_time_ps));
192     device_cumulative_fraction += op->self_time_fraction();
193     op->set_cumulative_time_fraction(device_cumulative_fraction);
194     op->set_flop_rate(
195         SafeDivide(metrics->flops(), PicoToNano(metrics->time_ps())));
196     auto iter = kernel_stats_by_op_name.find(op->name());
197     if (iter != kernel_stats_by_op_name.end()) {
198       op->set_is_op_tensorcore_eligible(
199           iter->second.is_op_tensor_core_eligible);
200       op->set_is_op_using_tensorcore(iter->second.tensor_core_duration_ns != 0);
201     }
202   }
203   uint64 total_device_compute_ps =
204       op_stats.device_op_metrics_db().precision_stats().compute_16bit_ps() +
205       op_stats.device_op_metrics_db().precision_stats().compute_32bit_ps();
206   analysis.set_device_compute_16bit_percent(
207       100.0 *
208       SafeDivide(
209           op_stats.device_op_metrics_db().precision_stats().compute_16bit_ps(),
210           total_device_compute_ps));
211   analysis.set_device_compute_32bit_percent(
212       100.0 *
213       SafeDivide(
214           op_stats.device_op_metrics_db().precision_stats().compute_32bit_ps(),
215           total_device_compute_ps));
216 
217   uint64 num_host_tf_ops = 0;
218   uint64 total_host_op_time_ps_exclude_idle = 0;
219   uint64 eager_host_op_time_ps = 0;
220   for (const OpMetrics& metrics : op_stats.host_op_metrics_db().metrics_db()) {
221     num_host_tf_ops += metrics.occurrences();
222     if (!IsIdleOp(metrics)) {
223       total_host_op_time_ps_exclude_idle += metrics.self_time_ps();
224       if (metrics.is_eager()) eager_host_op_time_ps += metrics.self_time_ps();
225     }
226   }
227   uint64 num_device_tf_ops = 0;
228   uint64 total_device_op_time_ps_exclude_idle = 0;
229   uint64 eager_device_op_time_ps = 0;
230   for (const OpMetrics& metrics : device_tf_op_metrics_db.metrics_db()) {
231     num_device_tf_ops += metrics.occurrences();
232     if (!IsIdleOp(metrics)) {
233       total_device_op_time_ps_exclude_idle += metrics.self_time_ps();
234       if (metrics.is_eager()) eager_device_op_time_ps += metrics.self_time_ps();
235     }
236   }
237   // Figures out outside_compilation time from
238   // op_stats.device_op_metrics_db().metrics_db(). We don't use the
239   // {metrics.provenance(), metrics.name()} from
240   // device_tf_op_metrics_db.metrics_db(), because metrics.provenance() there is
241   // not set and metrics.name() can be either HLO-Op name or TF-Op name, which
242   // will confuse IsOutsideCompilationOp().
243   uint64 outside_compilation_device_op_time_ps = 0;
244   for (const OpMetrics& metrics :
245        op_stats.device_op_metrics_db().metrics_db()) {
246     if (!IsOutsideCompilationOp(metrics.provenance(), metrics.long_name()))
247       continue;
248     outside_compilation_device_op_time_ps += metrics.self_time_ps();
249   }
250   uint64 num_total_tf_ops = num_host_tf_ops + num_device_tf_ops;
251   analysis.set_host_tf_op_percent(
252       100.0 * SafeDivide(num_host_tf_ops, num_total_tf_ops));
253   analysis.set_device_tf_op_percent(
254       100.0 * SafeDivide(num_device_tf_ops, num_total_tf_ops));
255   analysis.set_host_trace_level(op_stats.run_environment().host_trace_level());
256   analysis.set_host_op_time_eager_percent(
257       100.0 *
258       SafeDivide(eager_host_op_time_ps, total_host_op_time_ps_exclude_idle));
259   analysis.set_device_op_time_eager_percent(
260       100.0 * SafeDivide(eager_device_op_time_ps,
261                          total_device_op_time_ps_exclude_idle));
262   analysis.set_device_op_time_outside_compilation_percent(
263       100.0 * SafeDivide(outside_compilation_device_op_time_ps,
264                          total_device_op_time_ps_exclude_idle));
265   return analysis;
266 }
267 
268 // Converts from HostIndependentJobInfo to OverviewPageHostIndependentJobInfo.
ToOverviewPageHostIndependentJobInfo(const HostIndependentJobInfoResult & host_independent_job_info)269 OverviewPageHostIndependentJobInfo ToOverviewPageHostIndependentJobInfo(
270     const HostIndependentJobInfoResult& host_independent_job_info) {
271   OverviewPageHostIndependentJobInfo result;
272   result.set_change_list(host_independent_job_info.change_list());
273   result.set_build_time(host_independent_job_info.build_time());
274   result.set_build_target(host_independent_job_info.build_target());
275   result.set_profile_duration_ms(
276       host_independent_job_info.profile_duration_ms());
277   return result;
278 }
279 
280 // Converts from HostDependentJobInfo to OverviewPageHostDependentJobInfo.
ToOverviewPageHostDependentJobInfo(const HostDependentJobInfoResult & host_dependent_job_info)281 OverviewPageHostDependentJobInfo ToOverviewPageHostDependentJobInfo(
282     const HostDependentJobInfoResult& host_dependent_job_info) {
283   OverviewPageHostDependentJobInfo result;
284   result.set_host_id(host_dependent_job_info.host_id());
285   result.set_command_line(host_dependent_job_info.command_line());
286   result.set_start_time(host_dependent_job_info.start_time());
287   result.set_bns_address(host_dependent_job_info.bns_address());
288   result.set_profile_time_ns(host_dependent_job_info.profile_time_ns());
289   return result;
290 }
291 
ComputeRunEnvironment(const RunEnvironment & run_environment)292 OverviewPageRunEnvironment ComputeRunEnvironment(
293     const RunEnvironment& run_environment) {
294   OverviewPageRunEnvironment re;
295   re.set_host_count(run_environment.host_count());
296   re.set_task_count(run_environment.task_count());
297   re.set_device_type(run_environment.device_type());
298   re.set_device_core_count(run_environment.device_core_count());
299   re.set_replica_count(run_environment.replica_count());
300   re.set_num_cores_per_replica(run_environment.num_cores_per_replica());
301   *re.mutable_host_independent_job_info() =
302       ToOverviewPageHostIndependentJobInfo(
303           run_environment.host_independent_job_info());
304   for (const auto& host_dependent_job_info :
305        run_environment.host_dependent_job_info()) {
306     *re.add_host_dependent_job_info() =
307         ToOverviewPageHostDependentJobInfo(host_dependent_job_info);
308   }
309   return re;
310 }
311 
TfFunctionRecommendationHtml(const TfFunctionDb & tf_function_db)312 std::string TfFunctionRecommendationHtml(const TfFunctionDb& tf_function_db) {
313   std::vector<TfFunctionInfo> candidates;
314   for (const auto& name_fun : tf_function_db.tf_functions()) {
315     const auto& fun = name_fun.second;
316     if (fun.expensive_call_percent() >= kTfFunctionReportThresholdInPercent) {
317       candidates.push_back({name_fun.first, fun.expensive_call_percent()});
318     }
319   }
320   if (candidates.empty()) return "";
321   auto cmp = [](const TfFunctionInfo& a, const TfFunctionInfo& b) {
322     return a.expensive_call_percent > b.expensive_call_percent;
323   };
324   // Sorts candidates in descending order of expensive_call_percent.
325   absl::c_sort(candidates, cmp);
326   std::string expensive_functions = "";
327   auto num_functions_shown = std::min(
328       static_cast<decltype(candidates)::size_type>(3), candidates.size());
329 
330   for (decltype(candidates)::size_type i = 0; i < num_functions_shown; i++) {
331     if (i > 0) absl::StrAppend(&expensive_functions, ", ");
332     absl::StrAppend(&expensive_functions, "\"", candidates[i].function_name,
333                     "\"");
334   }
335   if (candidates.size() > num_functions_shown)
336     absl::StrAppend(&expensive_functions, " and more");
337   return absl::StrCat("Expensive tf-functions detected (", expensive_functions,
338                       ") due to either retracing or eager execution.");
339 }
340 
EagerRecommendationHtml(double host_op_time_eager_percent,double device_op_time_eager_percent)341 std::string EagerRecommendationHtml(double host_op_time_eager_percent,
342                                     double device_op_time_eager_percent) {
343   std::string recommendation = "";
344   if (host_op_time_eager_percent > kEagerReportThresholdInPercent)
345     absl::StrAppend(&recommendation, OneDigit(host_op_time_eager_percent),
346                     "% of Op time on the host used eager execution. ");
347   if (device_op_time_eager_percent > kEagerReportThresholdInPercent)
348     absl::StrAppend(&recommendation, OneDigit(device_op_time_eager_percent),
349                     "% of Op time on the device used eager execution. ");
350   if (!recommendation.empty())
351     absl::StrAppend(&recommendation, "Performance could be improved with ",
352                     AnchorElement("https://www.tensorflow.org/guide/function",
353                                   "tf.function."));
354   return recommendation;
355 }
356 
OutsideCompilationRecommendationHtml(double device_op_time_outside_compilation_percent)357 std::string OutsideCompilationRecommendationHtml(
358     double device_op_time_outside_compilation_percent) {
359   if (device_op_time_outside_compilation_percent <=
360       kOutsideCompilationThresholdInPercent)
361     return "";
362   return absl::StrCat(
363       OneDigit(device_op_time_outside_compilation_percent),
364       " % of Op time on the device are for outside compilation. Performance "
365       "could be improved by avoiding outside compilation.");
366 }
367 
ConvertOpStatsToOverviewPage(const OpStats & op_stats)368 OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats) {
369   OverviewPage overview_page;
370   *overview_page.mutable_run_environment() =
371       ComputeRunEnvironment(op_stats.run_environment());
372   *overview_page.mutable_analysis() = ComputeAnalysisResult(op_stats);
373   *overview_page.mutable_input_analysis() =
374       ConvertOpStatsToInputPipelineAnalysis(op_stats);
375   BottleneckAnalysis bottleneck = ComputeBottleneckAnalysis(
376       overview_page.input_analysis().input_time_breakdown(),
377       overview_page.input_analysis().step_details());
378   *overview_page.mutable_recommendation() = ComputeGenericRecommendation(
379       bottleneck, op_stats.device_op_metrics_db().precision_stats());
380   SetCommonRecommendation(
381       bottleneck.input_classification(), bottleneck.input_statement(), "",
382       ParseHardwareType(op_stats.run_environment().device_type()),
383       TfFunctionRecommendationHtml(op_stats.tf_function_db()),
384       EagerRecommendationHtml(
385           overview_page.analysis().host_op_time_eager_percent(),
386           overview_page.analysis().device_op_time_eager_percent()),
387       OutsideCompilationRecommendationHtml(
388           overview_page.analysis()
389               .device_op_time_outside_compilation_percent()),
390       overview_page.mutable_recommendation());
391   PopulateOverviewDiagnostics(op_stats, overview_page.mutable_diagnostics());
392   overview_page.mutable_analysis()->set_mxu_utilization_percent(
393       op_stats.performance_counter_result().matrix_unit_utilization_percent());
394   return overview_page;
395 }
396 
397 }  // namespace profiler
398 }  // namespace tensorflow
399