1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/profiler/convert/op_stats_to_overview_page.h"
17
18 #include <string>
19
20 #include "google/protobuf/any.pb.h"
21 #include "absl/strings/str_cat.h"
22 #include "tensorflow/core/platform/types.h"
23 #include "tensorflow/core/profiler/convert/op_metrics_to_record.h"
24 #include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h"
25 #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
26 #include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h"
27 #include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
28 #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
29 #include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
30 #include "tensorflow/core/profiler/protobuf/overview_page.pb.h"
31 #include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
32 #include "tensorflow/core/profiler/protobuf/tf_function.pb.h"
33 #include "tensorflow/core/profiler/utils/diagnostics.h"
34 #include "tensorflow/core/profiler/utils/format_utils.h"
35 #include "tensorflow/core/profiler/utils/hardware_type_utils.h"
36 #include "tensorflow/core/profiler/utils/html_utils.h"
37 #include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
38 #include "tensorflow/core/profiler/utils/math_utils.h"
39 #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h"
40 #include "tensorflow/core/profiler/utils/tf_op_utils.h"
41 #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
42 #include "tensorflow/core/profiler/utils/xplane_schema.h"
43 #include "tensorflow/core/profiler/utils/xplane_utils.h"
44
45 namespace tensorflow {
46 namespace profiler {
47
48 namespace {
49
// Minimum share (in percent) of device compute time expected to come from
// 16-bit ops. When the measured share is below this value, the precision
// statement suggests replacing more 32-bit ops with 16-bit ones.
constexpr double kLowPrecisionPercentThreshold = 10.0;
53
54 struct TfFunctionInfo {
55 absl::string_view function_name;
56 double expensive_call_percent;
57 };
58
MakeOverviewPageTip(std::string text)59 OverviewPageTip MakeOverviewPageTip(std::string text) {
60 OverviewPageTip tip;
61 tip.set_link(std::move(text));
62 return tip;
63 }
64
65 // Makes a recommendation for looking up a document.
66 // doc_url is expected to be already be escaped suitably for use in an HTML
67 // attribute.
MakeOverviewPageTipDocLink(absl::string_view doc_url,absl::string_view text)68 OverviewPageTip MakeOverviewPageTipDocLink(absl::string_view doc_url,
69 absl::string_view text) {
70 return MakeOverviewPageTip(AnchorElement(doc_url, text));
71 }
72
ComputeHostTips(OverviewPageRecommendation * re)73 void ComputeHostTips(OverviewPageRecommendation* re) {
74 *re->add_host_tips() = MakeOverviewPageTip(
75 "input_pipeline_analyzer (especially Section 3 for the breakdown of "
76 "input operations on the Host)");
77 *re->add_host_tips() = MakeOverviewPageTip(
78 "tf_data_bottleneck_analysis (find the bottleneck in the tf.data input "
79 "pipeline)");
80 *re->add_host_tips() = MakeOverviewPageTip(
81 "trace_viewer (look at the activities on the timeline of each Host "
82 "Thread near the bottom of the trace view)");
83 }
84
ComputeDeviceTips(HardwareType hardware_type,OverviewPageRecommendation * re)85 void ComputeDeviceTips(HardwareType hardware_type,
86 OverviewPageRecommendation* re) {
87 absl::string_view device_name = HardwareType_Name(hardware_type);
88 absl::string_view timeline_name = device_name;
89 absl::string_view op_stats_toolname = "tensorflow_stats";
90 if (hardware_type == tensorflow::profiler::TPU) {
91 timeline_name = "TPU core";
92 op_stats_toolname = "op_profile";
93 }
94 *re->add_device_tips() = MakeOverviewPageTip(
95 absl::StrCat(op_stats_toolname,
96 " (identify the time-consuming operations "
97 "executed on the ",
98 device_name, ")"));
99 *re->add_device_tips() = MakeOverviewPageTip(absl::StrCat(
100 "trace_viewer (look at the activities on the timeline of each ",
101 timeline_name, " in the trace view)"));
102 }
103
ComputeFaqTips(OverviewPageRecommendation * re)104 void ComputeFaqTips(OverviewPageRecommendation* re) {
105 *re->add_faq_tips() = MakeOverviewPageTip("Refer to the TF2 Profiler FAQ");
106 }
107
ComputeDocumentationTips(OverviewPageRecommendation * re)108 void ComputeDocumentationTips(OverviewPageRecommendation* re) {
109 *re->add_documentation_tips() = MakeOverviewPageTipDocLink(
110 "https://www.tensorflow.org/guide/data_performance_analysis",
111 "Analyze tf.data performance with the TF Profiler");
112 *re->add_documentation_tips() = MakeOverviewPageTipDocLink(
113 "https://www.tensorflow.org/guide/"
114 "data_performance",
115 "Better performance with the tf.data API");
116 }
117
GeneratePrecisionStatement(const PrecisionStats & precision_stats)118 std::string GeneratePrecisionStatement(const PrecisionStats& precision_stats) {
119 uint64 total_compute_ps =
120 precision_stats.compute_16bit_ps() + precision_stats.compute_32bit_ps();
121 if (total_compute_ps > 0) {
122 double percent_16bit =
123 (100.0 * precision_stats.compute_16bit_ps()) / total_compute_ps;
124 if (percent_16bit < kLowPrecisionPercentThreshold) {
125 return absl::StrCat(
126 "Only ", OneDigit(percent_16bit),
127 "% of device computation is 16 bit. So you might want to replace "
128 "more 32-bit Ops by 16-bit Ops to improve performance (if the "
129 "reduced accuracy is acceptable).");
130 }
131 }
132 return "";
133 }
134
135 } // namespace
136
SetCommonRecommendation(absl::string_view input_classification,absl::string_view input_statement,absl::string_view output_statement,HardwareType hardware_type,absl::string_view tf_function_statement_html,absl::string_view eager_statement_html,absl::string_view outside_compilation_statement_html,OverviewPageRecommendation * re)137 void SetCommonRecommendation(
138 absl::string_view input_classification, absl::string_view input_statement,
139 absl::string_view output_statement, HardwareType hardware_type,
140 absl::string_view tf_function_statement_html,
141 absl::string_view eager_statement_html,
142 absl::string_view outside_compilation_statement_html,
143 OverviewPageRecommendation* re) {
144 re->set_bottleneck(std::string(input_classification));
145 re->set_statement(std::string(input_statement));
146 re->set_output_statement(std::string(output_statement));
147 re->set_tf_function_statement_html(std::string(tf_function_statement_html));
148 re->set_eager_statement_html(std::string(eager_statement_html));
149 re->set_outside_compilation_statement_html(
150 std::string(outside_compilation_statement_html));
151 ComputeHostTips(re);
152 ComputeDeviceTips(hardware_type, re);
153 ComputeDocumentationTips(re);
154 ComputeFaqTips(re);
155 }
156
ComputeGenericRecommendation(const BottleneckAnalysis & bottleneck,const PrecisionStats & precision_stats)157 OverviewPageRecommendation ComputeGenericRecommendation(
158 const BottleneckAnalysis& bottleneck,
159 const PrecisionStats& precision_stats) {
160 OverviewPageRecommendation re;
161 GenericRecommendation generic;
162 generic.set_device_collectives_bottleneck(
163 bottleneck.device_collectives_classification());
164 generic.set_device_collectives_statement(
165 bottleneck.device_collectives_statement());
166 generic.set_kernel_launch_bottleneck(
167 bottleneck.kernel_launch_classification());
168 generic.set_kernel_launch_statement(bottleneck.kernel_launch_statement());
169 generic.set_all_other_bottleneck(bottleneck.all_other_classification());
170 generic.set_all_other_statement(bottleneck.all_other_statement());
171 generic.set_precision_statement(GeneratePrecisionStatement(precision_stats));
172 re.mutable_recommendation()->PackFrom(generic);
173 return re;
174 }
175
ComputeAnalysisResult(const OpStats & op_stats)176 OverviewPageAnalysis ComputeAnalysisResult(const OpStats& op_stats) {
177 OverviewPageAnalysis analysis;
178 OpMetricsDb device_tf_op_metrics_db = CreateTfMetricsDbFromDeviceOpMetricsDb(
179 op_stats.device_op_metrics_db(), /*with_idle=*/false);
180 KernelStatsByOpName kernel_stats_by_op_name =
181 GroupKernelReportsByOpName(op_stats.kernel_stats_db());
182 uint64 total_device_time_ps = device_tf_op_metrics_db.total_time_ps();
183 constexpr int kNumTopOpsShown = 10;
184 double device_cumulative_fraction = 0.0;
185 for (const OpMetrics* metrics :
186 SortedOpMetricsDb(device_tf_op_metrics_db, kNumTopOpsShown)) {
187 OverviewTfOp* op = analysis.add_top_device_ops();
188 op->set_name(metrics->name());
189 op->set_category(metrics->category());
190 op->set_self_time_fraction(
191 SafeDivide(metrics->self_time_ps(), total_device_time_ps));
192 device_cumulative_fraction += op->self_time_fraction();
193 op->set_cumulative_time_fraction(device_cumulative_fraction);
194 op->set_flop_rate(
195 SafeDivide(metrics->flops(), PicoToNano(metrics->time_ps())));
196 auto iter = kernel_stats_by_op_name.find(op->name());
197 if (iter != kernel_stats_by_op_name.end()) {
198 op->set_is_op_tensorcore_eligible(
199 iter->second.is_op_tensor_core_eligible);
200 op->set_is_op_using_tensorcore(iter->second.tensor_core_duration_ns != 0);
201 }
202 }
203 uint64 total_device_compute_ps =
204 op_stats.device_op_metrics_db().precision_stats().compute_16bit_ps() +
205 op_stats.device_op_metrics_db().precision_stats().compute_32bit_ps();
206 analysis.set_device_compute_16bit_percent(
207 100.0 *
208 SafeDivide(
209 op_stats.device_op_metrics_db().precision_stats().compute_16bit_ps(),
210 total_device_compute_ps));
211 analysis.set_device_compute_32bit_percent(
212 100.0 *
213 SafeDivide(
214 op_stats.device_op_metrics_db().precision_stats().compute_32bit_ps(),
215 total_device_compute_ps));
216
217 uint64 num_host_tf_ops = 0;
218 uint64 total_host_op_time_ps_exclude_idle = 0;
219 uint64 eager_host_op_time_ps = 0;
220 for (const OpMetrics& metrics : op_stats.host_op_metrics_db().metrics_db()) {
221 num_host_tf_ops += metrics.occurrences();
222 if (!IsIdleOp(metrics)) {
223 total_host_op_time_ps_exclude_idle += metrics.self_time_ps();
224 if (metrics.is_eager()) eager_host_op_time_ps += metrics.self_time_ps();
225 }
226 }
227 uint64 num_device_tf_ops = 0;
228 uint64 total_device_op_time_ps_exclude_idle = 0;
229 uint64 eager_device_op_time_ps = 0;
230 for (const OpMetrics& metrics : device_tf_op_metrics_db.metrics_db()) {
231 num_device_tf_ops += metrics.occurrences();
232 if (!IsIdleOp(metrics)) {
233 total_device_op_time_ps_exclude_idle += metrics.self_time_ps();
234 if (metrics.is_eager()) eager_device_op_time_ps += metrics.self_time_ps();
235 }
236 }
237 // Figures out outside_compilation time from
238 // op_stats.device_op_metrics_db().metrics_db(). We don't use the
239 // {metrics.provenance(), metrics.name()} from
240 // device_tf_op_metrics_db.metrics_db(), because metrics.provenance() there is
241 // not set and metrics.name() can be either HLO-Op name or TF-Op name, which
242 // will confuse IsOutsideCompilationOp().
243 uint64 outside_compilation_device_op_time_ps = 0;
244 for (const OpMetrics& metrics :
245 op_stats.device_op_metrics_db().metrics_db()) {
246 if (!IsOutsideCompilationOp(metrics.provenance(), metrics.long_name()))
247 continue;
248 outside_compilation_device_op_time_ps += metrics.self_time_ps();
249 }
250 uint64 num_total_tf_ops = num_host_tf_ops + num_device_tf_ops;
251 analysis.set_host_tf_op_percent(
252 100.0 * SafeDivide(num_host_tf_ops, num_total_tf_ops));
253 analysis.set_device_tf_op_percent(
254 100.0 * SafeDivide(num_device_tf_ops, num_total_tf_ops));
255 analysis.set_host_trace_level(op_stats.run_environment().host_trace_level());
256 analysis.set_host_op_time_eager_percent(
257 100.0 *
258 SafeDivide(eager_host_op_time_ps, total_host_op_time_ps_exclude_idle));
259 analysis.set_device_op_time_eager_percent(
260 100.0 * SafeDivide(eager_device_op_time_ps,
261 total_device_op_time_ps_exclude_idle));
262 analysis.set_device_op_time_outside_compilation_percent(
263 100.0 * SafeDivide(outside_compilation_device_op_time_ps,
264 total_device_op_time_ps_exclude_idle));
265 return analysis;
266 }
267
268 // Converts from HostIndependentJobInfo to OverviewPageHostIndependentJobInfo.
ToOverviewPageHostIndependentJobInfo(const HostIndependentJobInfoResult & host_independent_job_info)269 OverviewPageHostIndependentJobInfo ToOverviewPageHostIndependentJobInfo(
270 const HostIndependentJobInfoResult& host_independent_job_info) {
271 OverviewPageHostIndependentJobInfo result;
272 result.set_change_list(host_independent_job_info.change_list());
273 result.set_build_time(host_independent_job_info.build_time());
274 result.set_build_target(host_independent_job_info.build_target());
275 result.set_profile_duration_ms(
276 host_independent_job_info.profile_duration_ms());
277 return result;
278 }
279
280 // Converts from HostDependentJobInfo to OverviewPageHostDependentJobInfo.
ToOverviewPageHostDependentJobInfo(const HostDependentJobInfoResult & host_dependent_job_info)281 OverviewPageHostDependentJobInfo ToOverviewPageHostDependentJobInfo(
282 const HostDependentJobInfoResult& host_dependent_job_info) {
283 OverviewPageHostDependentJobInfo result;
284 result.set_host_id(host_dependent_job_info.host_id());
285 result.set_command_line(host_dependent_job_info.command_line());
286 result.set_start_time(host_dependent_job_info.start_time());
287 result.set_bns_address(host_dependent_job_info.bns_address());
288 result.set_profile_time_ns(host_dependent_job_info.profile_time_ns());
289 return result;
290 }
291
ComputeRunEnvironment(const RunEnvironment & run_environment)292 OverviewPageRunEnvironment ComputeRunEnvironment(
293 const RunEnvironment& run_environment) {
294 OverviewPageRunEnvironment re;
295 re.set_host_count(run_environment.host_count());
296 re.set_task_count(run_environment.task_count());
297 re.set_device_type(run_environment.device_type());
298 re.set_device_core_count(run_environment.device_core_count());
299 re.set_replica_count(run_environment.replica_count());
300 re.set_num_cores_per_replica(run_environment.num_cores_per_replica());
301 *re.mutable_host_independent_job_info() =
302 ToOverviewPageHostIndependentJobInfo(
303 run_environment.host_independent_job_info());
304 for (const auto& host_dependent_job_info :
305 run_environment.host_dependent_job_info()) {
306 *re.add_host_dependent_job_info() =
307 ToOverviewPageHostDependentJobInfo(host_dependent_job_info);
308 }
309 return re;
310 }
311
TfFunctionRecommendationHtml(const TfFunctionDb & tf_function_db)312 std::string TfFunctionRecommendationHtml(const TfFunctionDb& tf_function_db) {
313 std::vector<TfFunctionInfo> candidates;
314 for (const auto& name_fun : tf_function_db.tf_functions()) {
315 const auto& fun = name_fun.second;
316 if (fun.expensive_call_percent() >= kTfFunctionReportThresholdInPercent) {
317 candidates.push_back({name_fun.first, fun.expensive_call_percent()});
318 }
319 }
320 if (candidates.empty()) return "";
321 auto cmp = [](const TfFunctionInfo& a, const TfFunctionInfo& b) {
322 return a.expensive_call_percent > b.expensive_call_percent;
323 };
324 // Sorts candidates in descending order of expensive_call_percent.
325 absl::c_sort(candidates, cmp);
326 std::string expensive_functions = "";
327 auto num_functions_shown = std::min(
328 static_cast<decltype(candidates)::size_type>(3), candidates.size());
329
330 for (decltype(candidates)::size_type i = 0; i < num_functions_shown; i++) {
331 if (i > 0) absl::StrAppend(&expensive_functions, ", ");
332 absl::StrAppend(&expensive_functions, "\"", candidates[i].function_name,
333 "\"");
334 }
335 if (candidates.size() > num_functions_shown)
336 absl::StrAppend(&expensive_functions, " and more");
337 return absl::StrCat("Expensive tf-functions detected (", expensive_functions,
338 ") due to either retracing or eager execution.");
339 }
340
EagerRecommendationHtml(double host_op_time_eager_percent,double device_op_time_eager_percent)341 std::string EagerRecommendationHtml(double host_op_time_eager_percent,
342 double device_op_time_eager_percent) {
343 std::string recommendation = "";
344 if (host_op_time_eager_percent > kEagerReportThresholdInPercent)
345 absl::StrAppend(&recommendation, OneDigit(host_op_time_eager_percent),
346 "% of Op time on the host used eager execution. ");
347 if (device_op_time_eager_percent > kEagerReportThresholdInPercent)
348 absl::StrAppend(&recommendation, OneDigit(device_op_time_eager_percent),
349 "% of Op time on the device used eager execution. ");
350 if (!recommendation.empty())
351 absl::StrAppend(&recommendation, "Performance could be improved with ",
352 AnchorElement("https://www.tensorflow.org/guide/function",
353 "tf.function."));
354 return recommendation;
355 }
356
OutsideCompilationRecommendationHtml(double device_op_time_outside_compilation_percent)357 std::string OutsideCompilationRecommendationHtml(
358 double device_op_time_outside_compilation_percent) {
359 if (device_op_time_outside_compilation_percent <=
360 kOutsideCompilationThresholdInPercent)
361 return "";
362 return absl::StrCat(
363 OneDigit(device_op_time_outside_compilation_percent),
364 " % of Op time on the device are for outside compilation. Performance "
365 "could be improved by avoiding outside compilation.");
366 }
367
ConvertOpStatsToOverviewPage(const OpStats & op_stats)368 OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats) {
369 OverviewPage overview_page;
370 *overview_page.mutable_run_environment() =
371 ComputeRunEnvironment(op_stats.run_environment());
372 *overview_page.mutable_analysis() = ComputeAnalysisResult(op_stats);
373 *overview_page.mutable_input_analysis() =
374 ConvertOpStatsToInputPipelineAnalysis(op_stats);
375 BottleneckAnalysis bottleneck = ComputeBottleneckAnalysis(
376 overview_page.input_analysis().input_time_breakdown(),
377 overview_page.input_analysis().step_details());
378 *overview_page.mutable_recommendation() = ComputeGenericRecommendation(
379 bottleneck, op_stats.device_op_metrics_db().precision_stats());
380 SetCommonRecommendation(
381 bottleneck.input_classification(), bottleneck.input_statement(), "",
382 ParseHardwareType(op_stats.run_environment().device_type()),
383 TfFunctionRecommendationHtml(op_stats.tf_function_db()),
384 EagerRecommendationHtml(
385 overview_page.analysis().host_op_time_eager_percent(),
386 overview_page.analysis().device_op_time_eager_percent()),
387 OutsideCompilationRecommendationHtml(
388 overview_page.analysis()
389 .device_op_time_outside_compilation_percent()),
390 overview_page.mutable_recommendation());
391 PopulateOverviewDiagnostics(op_stats, overview_page.mutable_diagnostics());
392 overview_page.mutable_analysis()->set_mxu_utilization_percent(
393 op_stats.performance_counter_result().matrix_unit_utilization_percent());
394 return overview_page;
395 }
396
397 } // namespace profiler
398 } // namespace tensorflow
399